0. Setting up the analysis directories & common variables and Loading necessary packages¶

In [1]:
# set work directory 
%pwd
%cd /scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis
%pwd
/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis
Out[1]:
'/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis'
In [2]:
# supress warnings
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
import sys
import os
_stderr = sys.stderr
null = open(os.devnull,'wb')
In [3]:
# project directory
projDir = '/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/'
In [4]:
# output directory
outDir = projDir + 'output/'
import os
if not os.path.exists(outDir):
    os.makedirs(outDir)
In [5]:
# temporary directory
tmpDir = '/scicore/home/schiera/liu0005/tmp/'
import os
if not os.path.exists(tmpDir):
    os.makedirs(tmpDir)
In [6]:
# necessary packages
import pickle
import os
import dill
import pandas as pd
import numpy as np
import pyranges as pr
import scanpy as sc
import time


from pycisTopic.cistopic_class import *
from pycisTopic.lda_models import *
from pycisTopic.topic_binarization import *
from pycisTopic.topic_qc import *
from pycisTopic.diff_features import *
from pycistarget.utils import region_names_to_coordinates
from scenicplus.wrappers.run_pycistarget import run_pycistarget
from scenicplus.scenicplus_class import create_SCENICPLUS_object
from scenicplus.cistromes import *
from scenicplus.enhancer_to_gene import get_search_space, calculate_regions_to_genes_relationships, GBM_KWARGS
from scenicplus.TF_to_gene import *
from scenicplus.grn_builder.gsea_approach import build_grn
from scenicplus.utils import format_egrns
from scenicplus.eregulon_enrichment import *
from scenicplus.plotting.correlation_plot import *
from scenicplus.preprocessing.filtering import apply_std_filtering_to_eRegulons
from scenicplus.RSS import *
from scenicplus.plotting.dotplot import *
from scenicplus.loom import *

1. Creating a cisTopic object¶

tutorial comes from https://pycistopic.readthedocs.io/en/latest/Single_sample_workflow-RTD.html#

In [6]:
# read count matrix of peaks 
count_matrix = pd.read_feather(projDir + 'data/atac_data/highToSomite.peaks.counts.feather')
# read peak names and cell names, and use them as row names and colume names for count matrix
peak_names = pd.read_csv(projDir + 'data/atac_data/peak.txt', header=None, sep='\t')
count_matrix.index = peak_names[0]
cell_names = pd.read_csv(projDir + 'data/atac_data/cell.txt', header=None, sep='\t')
count_matrix.columns = cell_names[0]
In [ ]:
# check the matrix
count_matrix
In [8]:
# create cisTopic object
cistopic_obj = create_cistopic_object(fragment_matrix=count_matrix)
# add cell information
cell_data =  pd.read_csv(projDir+'data/atac_data/cell.metadata.txt', sep='\t')
cistopic_obj.add_cell_data(cell_data)
2022-10-30 14:17:36,470 cisTopic     INFO     Converting fragment matrix to sparse matrix
2022-10-30 14:22:17,102 cisTopic     INFO     Creating CistopicObject
2022-10-30 14:22:28,669 cisTopic     INFO     Done!
In [9]:
# check attributes of cistopic_obj
cistopic_obj_attri = (vars(cistopic_obj))
print(cistopic_obj_attri.keys())
print(cistopic_obj_attri['cell_data'])
dict_keys(['fragment_matrix', 'binary_matrix', 'cell_names', 'region_names', 'cell_data', 'region_data', 'project', 'path_to_fragments', 'selected_model', 'projections'])
In [11]:
# save cistopic_obj
with open(outDir + 'zf_highToSomite_cisTopicObject.pkl', 'wb') as f:
  pickle.dump(cistopic_obj, f)

2. Run models¶

In [12]:
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
In [13]:
cistopic_obj
Out[13]:
<pycisTopic.cistopic_class.CistopicObject at 0x2b840ee70f10>
In [ ]:
# run models
models=run_cgs_models(cistopic_obj,
                    n_topics=[400], 
                    n_cpu=40, 
                    n_iter=100,
                    random_state=555,
                    alpha=50,
                    alpha_by_topic=True,
                    eta=0.1,
                    eta_by_topic=False,
                    _temp_dir=tmpDir)
# save
with open(outDir + 'models/zf_highToSomite_models_400.pkl', 'wb') as f:
  pickle.dump(models, f)
2022-10-22 14:38:52,950	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
(run_cgs_model pid=236156) 2022-10-22 14:39:04,765 cisTopic     INFO     Running model with 200 topics
(run_cgs_model pid=236139) 2022-10-22 14:39:04,767 cisTopic     INFO     Running model with 90 topics
(run_cgs_model pid=236160) 2022-10-22 14:39:04,764 cisTopic     INFO     Running model with 100 topics
In [ ]:
'''
# this Mallet method is not working for some reasons 
from pycisTopic.lda_models import *
# configure path Mallet
path_to_mallet_binary = '/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/bin/mallet'
# run models
models=run_cgs_models_mallet(path_to_mallet_binary,
                    cistopic_obj,
                    n_topics=[60],
                    n_cpu=40, # The parallelization is done within each model. 
                    n_iter=100,
                    random_state=555,
                    alpha=50,
                    alpha_by_topic=True,
                    eta=0.1,
                    eta_by_topic=False,
                    tmp_path='/scicore/home/schiera/liu0005/tmp/',
                    save_path='/scicore/home/schiera/liu0005/tmp/')
# save
with open(outDir + 'models/zf_highToSomite_mallet_models_60.pkl', 'wb') as f:
  pickle.dump(models, f)

3. Model selection¶

In [28]:
infile1 = open(outDir + 'models/zf_highToSomite_models_10.pkl', 'rb')
infile2 = open(outDir + 'models/zf_highToSomite_models_20.pkl', 'rb')
infile3 = open(outDir + 'models/zf_highToSomite_models_30_40.pkl', 'rb')
infile4 = open(outDir + 'models/zf_highToSomite_models_50.pkl', 'rb')
infile5 = open(outDir + 'models/zf_highToSomite_models_60_70_80.pkl', 'rb')
infile6 = open(outDir + 'models/zf_highToSomite_models_90_100_200.pkl', 'rb')
infile7 = open(outDir + 'models/zf_highToSomite_models_150.pkl', 'rb')
infile8 = open(outDir + 'models/zf_highToSomite_models_250.pkl', 'rb')
infile9 = open(outDir + 'models/zf_highToSomite_models_300.pkl', 'rb')
infile10 = open(outDir + 'models/zf_highToSomite_models_400.pkl', 'rb')

models1 = pickle.load(infile1)
models2 = pickle.load(infile2)
models3 = pickle.load(infile3)
models4 = pickle.load(infile4)
models5 = pickle.load(infile5)
models6 = pickle.load(infile6)
models7 = pickle.load(infile7)
models8 = pickle.load(infile8)
models9 = pickle.load(infile9)
models10 = pickle.load(infile10)

infile1.close()
infile2.close()
infile3.close()
infile4.close()
infile5.close()
infile6.close()
infile7.close()
infile8.close()
infile9.close()
infile10.close()

models = models1 + models2 + models3 + models4 + models5 + models6 + models7 + models8 + models9 + models10
In [29]:
model=evaluate_models(models,
                     return_model=True,
                     metrics=['Arun_2010','Cao_Juan_2009', 'Minmo_2011', 'loglikelihood'],
                     plot_metrics=False,
                     save= outDir + 'models/model_selection.pdf')
In [ ]:
model.
In [14]:
# add model to cisTopicObject
cistopic_obj.add_LDA_model(model)
In [15]:
# save
with open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'wb') as f:
  pickle.dump(cistopic_obj, f)

4. Inferring candidate enhancer regions¶

4.1 Topic binarization & qc¶

In [25]:
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
In [ ]:
# binarize the topic-region distribution
region_bin_topics_top3k = binarize_topics(cistopic_obj, method='ntop', ntop = 3000)
region_bin_topics_otsu = binarize_topics(cistopic_obj, method='otsu',plot=True, num_columns=5, save= outDir + 'topic_binarization/otsu.pdf')
In [ ]:
# we can now binarize the cell-topic distribions.
binarized_cell_topic = binarize_topics(cistopic_obj, target='cell', method='li', plot=True, num_columns=5, nbins=100)
In [38]:
# we can compute the topic quality control metrics 
topic_qc_metrics = compute_topic_metrics(cistopic_obj)
topic_qc_metrics
In [ ]:
# create a figure dictionary to put all plots together
fig_dict={}
fig_dict['CoherenceVSAssignments']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Log10_Assignments', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['AssignmentsVSCells_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Log10_Assignments', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSCells_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Cells_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSRegions_in_bin']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Regions_in_binarized_topic', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSMarginal_dist']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Marginal_topic_dist', var_color='Gini_index', plot=False, return_fig=True)
fig_dict['CoherenceVSGini_index']=plot_topic_qc(topic_qc_metrics, var_x='Coherence', var_y='Gini_index', var_color='Gini_index', plot=False, return_fig=True)
In [44]:
# plot topic stats in one figure
fig=plt.figure(figsize=(40, 43))
i = 1
for fig_ in fig_dict.keys():
    plt.subplot(2, 3, i)
    img = fig2img(fig_dict[fig_]) 
    plt.imshow(img)
    plt.axis('off')
    i += 1
plt.subplots_adjust(wspace=0, hspace=-0.70)
fig.savefig(outDir + 'topic_binarization/Topic_qc.pdf', bbox_inches='tight')
plt.show()
In [ ]:
topic_annot = topic_annotation(cistopic_obj, annot_var='celltype', binarized_cell_topic=binarized_cell_topic, general_topic_thr = 0.2)
topic_annot
In [ ]:
# we can merge the topic metrics and their annotation in a data frame.
topic_qc_metrics = pd.concat([topic_annot[['celltype', 'Ratio_cells_in_topic', 'Ratio_group_in_population']], topic_qc_metrics], axis=1)
In [94]:
topic_qc_metrics.head(1)
Out[94]:
celltype Ratio_cells_in_topic Ratio_group_in_population Log10_Assignments Assignments Cells_in_binarized_topic Regions_in_binarized_topic Coherence Marginal_topic_dist Gini_index
Topic1 hatching gland(6somite), hatching gland(bud), lens placode(6somite), endoderm(75epi), endothelial progenitors(6somite), cephalic mesoderm(6somite), cephalic mesoderm(bud), prechordal plate(75epi), lateral plate mesoderm(75epi), lateral plate mesoderm(bud), endoderm(6somite), notochord(75epi), ventral diencephalon(6somite), ventral diencephalon(bud), neural floorplate(6somite), endoderm(bud), notochord(bud) 0.035104 0.088066 5.996757 992561 1439 2239 -1.742742 0.002014 0.889426
In [27]:
# save
with open(outDir + 'topic_binarization/Topic_qc_metrics_annot.pkl', 'wb') as f:
  pickle.dump(topic_qc_metrics, f)
with open(outDir + 'topic_binarization/binarized_cell_topic.pkl', 'wb') as f:
  pickle.dump(binarized_cell_topic, f)
with open(outDir + 'topic_binarization/binarized_topic_region_otsu.pkl', 'wb') as f:
  pickle.dump(region_bin_topics_otsu, f)
with open(outDir + 'topic_binarization/binarized_topic_region_top3k.pkl', 'wb') as f:
  pickle.dump(region_bin_topics_top3k, f)

4.2 Differentially Accessible Regions (DARs)¶

In [2]:
# load cisTopic object
infile = open(outDir + 'zf_highToSomite_cisTopicObject_withTopics.pkl', 'rb')
cistopic_obj = pickle.load(infile)
infile.close()
In [2]:
# impute the region accessibility exploting the cell-topic and topic-region probabilities
imputed_acc_obj = impute_accessibility(cistopic_obj, selected_cells=None, selected_regions=None, scale_factor=10**6)
2022-10-24 19:40:18,644 cisTopic     INFO     Imputing drop-outs
2022-10-24 19:41:01,055 cisTopic     INFO     Scaling
2022-10-24 19:41:39,657 cisTopic     INFO     Keep non zero rows
2022-10-24 19:42:30,574 cisTopic     INFO     Imputed accessibility sparsity: 0.48989547717527104
2022-10-24 19:42:30,578 cisTopic     INFO     Create CistopicImputedFeatures object
2022-10-24 19:42:30,579 cisTopic     INFO     Done!
In [7]:
# save
with open(outDir + 'DARs/imputed_acc_obj.pkl', 'wb') as f:
  pickle.dump(imputed_acc_obj, f)
In [4]:
'''
# log-normalize the imputed data
# even for 270g memory we can only process for 25000 cells, so we need split the 40992 cells into two parts
c1 = imputed_acc_obj.mtx[:,0:20000] ## get the first 20'000 cells
from pycisTopic.diff_features import *
scale_factor=10**6 # since we have half million peaks, so scale factor should be in the same magnitude 
c1 = normalize(c1, norm="l1", axis=0) # for each column (a cell), each elelment divided by the sum of all elelemnts in this coulmn  
c1 *= scale_factor
c1 = np.log1p(c1) ## except 0, the minimal value is 0.8285494910665001, the second minimal is 1.2753598240517567 
c1 = np.around(c1,decimals=2) 
## we need save c1 and remove c1, otherwise there is no memory for c2
with open(outDir + 'DARs/normalized_imputed_acc_1-20000cells_ndarray.pkl', 'wb') as f:
  pickle.dump(c1, f)
del c1

c2 = imputed_acc_obj.mtx[:,20000:40992] ## get the last 20'992 cells
from pycisTopic.diff_features import *
scale_factor=10**6
c2 = normalize(c2, norm="l1", axis=0) # for each column (a cell), each elelment divided by the sum of all elelemnts in this coulmn  
c2 *= scale_factor
c2 = np.log1p(c2)
c2 = np.around(c2,decimals=2) 
with open(outDir + 'DARs/normalized_imputed_acc_20001-40992cells_ndarray.pkl', 'wb') as f:
  pickle.dump(c2, f)
del c2
In [6]:
'''
## change imputed_acc_obj values as normalized ones 
# change data type to float32, since the normalized value is in float32, we need use the normalized value replace the current ones 
imputed_acc_obj.mtx = imputed_acc_obj.mtx.astype(np.float32)

infile1 = open(outDir + 'DARs/normalized_imputed_acc_1-20000cells_ndarray.pkl', 'rb')
c1 = pickle.load(infile1)
infile1.close()
imputed_acc_obj.mtx[:,0:20000] = c1[:,0:20000]
del c1

infile2 = open(outDir + 'DARs/normalized_imputed_acc_20001-40992cells_ndarray.pkl', 'rb')
c2 = pickle.load(infile2)
infile2.close()
imputed_acc_obj.mtx[:,20000:40992] = c2[:,0:20992]
del c2
In [15]:
'''
# save normalized one 
with open(outDir + 'DARs/normalized_imputed_acc_obj.pkl', 'wb') as f:
  pickle.dump(imputed_acc_obj, f)
In [17]:
'''
del imputed_acc_obj
In [1]:
'''
infile = open(outDir + 'DARs/normalized_imputed_acc_obj.pkl', 'rb')
normalized_imputed_acc_obj = pickle.load(infile)
infile.close()
In [2]:
'''
# identify highly variable regions
variable_regions = find_highly_variable_features(normalized_imputed_acc_obj,
                                           min_disp = 0.05,
                                           min_mean = 0.0125,
                                           max_mean = 3,
                                           max_disp = np.inf,
                                           n_bins=20,
                                           n_top_features=None,
                                           plot=True,
                                           save= outDir + 'DARs/HVR_plot.pdf')
2022-10-25 12:41:33,475 cisTopic     INFO     Calculating mean
2022-10-25 12:41:40,040 cisTopic     INFO     Calculating variance
2022-10-25 12:43:00,037 cisTopic     INFO     Done!
In [7]:
'''
# save variable_regions
with open(outDir + 'DARs/variable_regions.pkl', 'wb') as f:
  pickle.dump(variable_regions, f)

    
In [ ]:
'''
## identify differentially accessible regions 
# load imputed_acc object
infile1 = open(outDir + 'DARs/imputed_acc_obj.pkl', 'rb')
imputed_acc_obj = pickle.load(infile1)
infile1.close()
# load varaible regions
import pickle
infile2 = open(outDir + 'DARs/variable_regions.pkl', 'rb')
variable_regions = pickle.load(infile2)
infile2.close()

# we do this part in archr, since there is enough memory even we ask 270g  
markers_dict= find_diff_features(cistopic_obj,
                      imputed_acc_obj,
                      variable='celltype',
                      var_features=variable_regions,
                      contrasts=None,
                      adjpval_thr=0.05,
                      log2fc_thr=np.log2(1.5),
                      n_cpu=1) # there is a memory issue if we set pvalue > 1, each cell type costs 7 min
                               # even set cup=1, there is still no enough memory after finished runing 8 cell types
We use the DARs from ArchR, which is much faster and requires less memory.¶
In [48]:
DAR = pd.read_csv(outDir + '/DARs/DAR.txt', sep='\t') 
cell_types=np.unique(DAR["Contrast"])             
markers_dict = {}
for i in range(len(cell_types)):
        markers_dict[np.unique(DAR["Contrast"])[i]] = DAR[DAR["Contrast"]==cell_types[i]]
In [49]:
x = [print(x + ': '+ str(len(markers_dict[x]))) for x in markers_dict.keys()]
adaxial cells(6somite): 1059
adaxial cells(75epi): 784
adaxial cells(bud): 2782
anterior neural plate border(bud): 3009
cephalic mesoderm(6somite): 659
cephalic mesoderm(bud): 3683
deep cells(oblong): 2731
dorsal anterior(50epi): 710
dorsal anterior(shield): 1172
dorsal diencephalon(6somite): 1036
dorsal diencephalon(bud): 3642
dorsal ectoderm(50epi): 451
dorsal ectoderm(shield): 853
dorsal posterior(50epi): 777
dorsal posterior(shield): 1534
ectoderm(30epi): 2033
ectoderm(dome): 2274
endoderm(6somite): 201
endoderm(75epi): 1612
endoderm(bud): 2009
endoderm(shield): 1434
epidermis(6somite): 11116
epidermis(75epi): 3028
epidermis(bud): 9070
evl(30epi): 3715
evl(50epi): 21119
evl(6somite): 5082
evl(75epi): 25519
evl(bud): 15179
evl(dome): 1131
evl(shield): 33715
hatching gland(6somite): 918
hatching gland(bud): 2151
heart field(6somite): 2164
high cells: 204
hindbrain(6somite): 3369
hindbrain(bud): 3758
lateral plate mesoderm(75epi): 3551
lateral plate mesoderm(bud): 4571
lens placode(6somite): 361
margin tail(50epi): 3259
margin tail(75epi): 1546
margin tail(shield): 1483
midbrain(6somite): 5509
midbrain(bud): 3256
neural crest(6somite): 2033
neural plate anterior(75epi): 2240
neural plate posterior(75epi): 2880
non-dorsal margin(30epi): 706
non-dorsal margin(dome): 131
notochord(6somite): 713
notochord(75epi): 1277
notochord(bud): 2224
olfactory or adenohypophyseal placode(6somite): 1751
optic primordium(6somite): 9306
optic primordium(bud): 10294
otic placode(6somite): 1244
posterior neural plate border(6somite): 8645
posterior neural plate border(bud): 3804
prechordal plate(75epi): 660
pronephric duct+blood island(6somite): 286
psm(6somite): 3012
psm(75epi): 5030
psm(bud): 8819
somite(6somite): 9405
spinal cord(6somite): 4283
spinal cord(bud): 3317
tailbud(6somite): 1121
tailbud(bud): 3150
telencephalon(6somite): 2134
telencephalon(bud): 1754
ventral diencephalon(6somite): 291
ventral diencephalon(bud): 2267
ventral ectoderm(50epi): 1430
ventral ectoderm(shield): 1048
ventrolateral mesoderm(shield): 6203
ventrolateral mesoendoderm(50epi): 8189
ysl(50epi): 3597
ysl(75epi): 8936
ysl(bud): 11674
ysl(shield): 9470
In [80]:
# save
with open(outDir + 'DARs/DARs.pkl', 'wb') as f:
  pickle.dump(markers_dict, f)

5. Motif enrichment analysis using pycistarget¶

tutorial comes from: https://scenicplus.readthedocs.io/en/latest/pbmc_multiome_tutorial.html#Motif-enrichment-analysis-using-pycistarget

In [7]:
region_bin_topics_otsu = pickle.load(open(os.path.join(outDir, 'topic_binarization/binarized_topic_region_otsu.pkl'), 'rb'))
region_bin_topics_top3k = pickle.load(open(os.path.join(outDir, 'topic_binarization/binarized_topic_region_top3k.pkl'), 'rb'))
markers_dict = pickle.load(open(os.path.join(outDir, 'DARs/DARs.pkl'), 'rb'))
In [8]:
# convert to dictionary of pyranges objects.
region_sets = {}
region_sets['topics_otsu'] = {}
region_sets['topics_top_3'] = {}
region_sets['DARs'] = {}
for topic in region_bin_topics_otsu.keys():
    regions = region_bin_topics_otsu[topic].index[region_bin_topics_otsu[topic].index.str.startswith('chr')] #only keep regions on known chromosomes
    region_sets['topics_otsu'][topic] = pr.PyRanges(region_names_to_coordinates(regions))
for topic in region_bin_topics_top3k.keys():
    regions = region_bin_topics_top3k[topic].index[region_bin_topics_top3k[topic].index.str.startswith('chr')] #only keep regions on known chromosomes
    region_sets['topics_top_3'][topic] = pr.PyRanges(region_names_to_coordinates(regions))
for DAR in markers_dict.keys():
    regions = markers_dict[DAR].index[markers_dict[DAR].index.str.startswith('chr')] #only keep regions on known chromosomes
    region_sets['DARs'][DAR] = pr.PyRanges(region_names_to_coordinates(regions))
In [14]:
for key in region_sets.keys():
    print(f'{key}: {region_sets[key].keys()}')
topics_otsu: 
topics_top_3: 
DARs: 
In [19]:
region_sets['topics_otsu']['Topic1']
Out[19]:
Chromosome Start End
0 chr1 39455761 39456261
1 chr1 45758707 45759207
2 chr1 46493435 46493935
3 chr1 168701 169201
4 chr1 30627428 30627928
... ... ... ...
2234 chr25 22805637 22806137
2235 chr25 3870519 3871019
2236 chr25 31564724 31565224
2237 chr25 2815675 2816175
2238 chr25 18216903 18217403

2239 rows × 3 columns

In [ ]:
# change the data format for rankings database and scores database
rankings_db = os.path.join(projDir, 'data/motif_data/motif.ranks.v2.feather')
scores_db =  os.path.join(projDir, 'data/motif_data/motif.scores.v2.feather')

rankings_data = pd.read_feather(rankings_db)
rankings_data2 = rankings_data.iloc[:,0:444653].astype('int32') 
rankings_data2.iloc[:,444653:444654] = rankings_data.iloc[:,444653:444654]
rankings_data3 = pd.concat([rankings_data2, rankings_data.iloc[:,444653:444654]], axis=1)
rankings_data3.to_feather ("/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/data/motif_data/cluster_SCREEN.regions_vs_motifs.rankings.feather")

scores_data = pd.read_feather(scores_db)
scores_data2 = scores_data.iloc[:,0:444653].astype('float32')  
scores_data2.iloc[:,444653:444654] = scores_data.iloc[:,444653:444654]
scores_data3 = pd.concat([scores_data2, scores_data.iloc[:,444653:444654]], axis=1)
scores_data3.to_feather ("/scicore/home/schiera/liu0005/scMultiome_embryo/scenicplus_analysis/data/motif_data/cluster_SCREEN.regions_vs_motifs.scores.v2.feather")
In [20]:
rankings_data
Out[20]:
chr10:10002944-10003444 chr10:10003889-10004389 chr10:10004783-10005283 chr10:10008043-10008543 chr10:10010515-10011015 chr10:10013193-10013693 chr10:10015603-10016103 chr10:10016141-10016641 chr10:10017245-10017745 chr10:10017906-10018406 ... chr9:9980361-9980861 chr9:9982342-9982842 chr9:99830-100330 chr9:9986744-9987244 chr9:9989524-9990024 chr9:9990177-9990677 chr9:9992237-9992737 chr9:9994520-9995020 chr9:9998473-9998973 motifs
0 210970.0 140162.0 329619.0 139625.0 12098.0 141625.0 252753.0 440251.0 95493.0 309483.0 ... 43980.0 186246.0 339773.0 81884.0 159885.0 111277.0 369298.0 23704.0 258417.0 AL807792.3
1 210970.0 140162.0 329619.0 139625.0 12098.0 141625.0 252753.0 440251.0 95493.0 309483.0 ... 43980.0 186246.0 339773.0 81884.0 159885.0 111277.0 369298.0 23704.0 258417.0 atoh1a
2 210970.0 140162.0 329619.0 139625.0 12098.0 141625.0 252753.0 440251.0 95493.0 309483.0 ... 43980.0 186246.0 339773.0 81884.0 159885.0 111277.0 369298.0 23704.0 258417.0 atoh1b
3 210970.0 140162.0 329619.0 139625.0 12098.0 141625.0 252753.0 440251.0 95493.0 309483.0 ... 43980.0 186246.0 339773.0 81884.0 159885.0 111277.0 369298.0 23704.0 258417.0 atoh1c
4 210970.0 140162.0 329619.0 139625.0 12098.0 141625.0 252753.0 440251.0 95493.0 309483.0 ... 43980.0 186246.0 339773.0 81884.0 159885.0 111277.0 369298.0 23704.0 258417.0 bloc1s2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
907 116087.0 275516.0 421974.0 166876.0 55116.0 234540.0 233367.0 156814.0 355398.0 384833.0 ... 185121.0 326672.0 390268.0 134462.0 354930.0 97549.0 247974.0 361489.0 5034.0 znf384l
908 72519.0 153827.0 112587.0 152487.0 164072.0 72289.0 427372.0 388657.0 84688.0 397434.0 ... 151589.0 325407.0 271012.0 292360.0 161830.0 130951.0 376447.0 280961.0 377607.0 znf410
909 340369.0 113664.0 280685.0 409479.0 258887.0 195843.0 364326.0 44322.0 337690.0 187127.0 ... 230116.0 162260.0 406442.0 78966.0 420317.0 301185.0 142135.0 182187.0 433461.0 znf423
910 382654.0 65343.0 9933.0 137071.0 377358.0 382655.0 160793.0 409440.0 124034.0 148695.0 ... 188501.0 210257.0 320600.0 294394.0 33965.0 443686.0 254570.0 4557.0 428660.0 znf740a
911 382654.0 65343.0 9933.0 137071.0 377358.0 382655.0 160793.0 409440.0 124034.0 148695.0 ... 188501.0 210257.0 320600.0 294394.0 33965.0 443686.0 254570.0 4557.0 428660.0 znf740b

912 rows × 444654 columns

In [9]:
# define rankings, score and motif annotation database    
rankings_db = os.path.join(projDir, 'data/motif_data/cluster_SCREEN.regions_vs_motifs.rankings.v2.feather')
scores_db =  os.path.join(projDir, 'data/motif_data/cluster_SCREEN.regions_vs_motifs.scores.v2.feather')
motif_annotation = os.path.join(projDir, 'data/motif_data/motif.annotation.txt')
In [ ]:
# run pycistarget 
species = 'danio_rerio' 
run_pycistarget(
    region_sets = region_sets,
    species = species, 
    biomart_host = 'http://apr2020.archive.ensembl.org/', 
    save_path = os.path.join(outDir, 'motifs'),
    ctx_db_path = rankings_db,
    dem_db_path = scores_db,
    promoter_space = 1000,
    run_without_promoters = True,
    path_to_motif_annotations = motif_annotation, 
    _temp_dir = tmpDir, 
    annotation_version = 'danio_code_v1' 
    ) 
    
In [ ]:
# run pycistarget, ctx_nes_threshold set as 0  
species = 'danio_rerio'
run_pycistarget(
    region_sets = region_sets,
    species = species, 
    biomart_host = 'http://apr2020.archive.ensembl.org/',
    save_path = os.path.join(outDir, 'motifs/motifs_without_auc_filter'),
    ctx_db_path = rankings_db,
    dem_db_path = scores_db,
    ctx_nes_threshold = 0, 
    promoter_space = 1000,
    run_without_promoters = True,
    path_to_motif_annotations = motif_annotation, 
    n_cpu = 1, 
    _temp_dir = tmpDir, 
    annotation_version = 'danio_code_v1' 
    ) 
In [ ]:
# explore the results
menr = dill.load(open(os.path.join(outDir, 'motifs/menr.pkl'), 'rb'))
In [65]:
for key, value in menr.items() :
    print (key)
CTX_topics_otsu_All
CTX_topics_otsu_No_promoters
DEM_topics_otsu_All
DEM_topics_otsu_No_promoters
CTX_topics_top_3_All
CTX_topics_top_3_No_promoters
DEM_topics_top_3_All
DEM_topics_top_3_No_promoters
CTX_DARs_All
CTX_DARs_No_promoters
DEM_DARs_All
DEM_DARs_No_promoters
In [ ]:
for k1,v1 in menr['CTX_DARs_No_promoters'].items():
    
    print (k1)
    for k2, v2 in menr['CTX_DARs_No_promoters'][k1].cistromes['Database'].items():
        print (k2)
In [ ]:
# output all tfbs for each cell type specific peaks
df = pd.DataFrame(columns = ['celltype','regulon','enhancer'])
for k1,v1 in menr['CTX_DARs_All'].items():
    for k2, v2 in menr['CTX_DARs_All'][k1].cistromes['Database'].items():
        for v3 in v2: 
            df2 = pd.DataFrame({'celltype': k1, 'regulon': k2, 'enhancer': v3},index=[0])
            df = df.append(df2)
df.to_csv("output/motifs/CTX_DARs_All_regulons.txt",sep ='\t',index=False,header=True)
In [ ]:
# when working with the DEM method instead of the CTX method
df = pd.DataFrame(columns = ['celltype','regulon','enhancer'])
for k1,v1 in menr['DEM_DARs_No_promoters'].cistromes['Database'].items():
    for k2, v2 in menr['DEM_DARs_No_promoters'].cistromes['Database'][k1].items():
        for v3 in v2: 
            df2 = pd.DataFrame({'celltype': k1, 'regulon': k2, 'enhancer': v3},index=[0])
            df = df.append(df2)
df.to_csv("output/motifs/DEM_DARs_No_promoters_regulons.txt",sep ='\t',index=False,header=True)
In [ ]:
# output enriched motifs for each cell type 
df = pd.DataFrame(columns = ['Logo','Region_set','Direct_annot','Orthology_annot','NES','AUC','Rank_at_max','Motif_hits'])
for k1,v1 in menr['CTX_DARs_All'].items():
    df2 = menr['CTX_DARs_All'][k1].motif_enrichment
    df = df.append(df2)
df.to_csv("output/motifs/CTX_DARs_All_regulons_motif_enrichment.txt",sep ='\t',index=False,header=True)
In [17]:
menr['CTX_DARs_All']['adaxial cells(6somite)'].motif_enrichment
Out[17]:
Logo Region_set Direct_annot Orthology_annot NES AUC Rank_at_max Motif_hits
myog <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myog.png" width="200" > adaxial cells(6somite) myog NaN 8.139853 0.024555 21907.0 293
tal1 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/tal1.png" width="200" > adaxial cells(6somite) tal1 NaN 5.777286 0.018207 21936.0 292
figla <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/figla.png" width="200" > adaxial cells(6somite) figla NaN 5.761635 0.018165 18587.0 182
myod1 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myod1.png" width="200" > adaxial cells(6somite) myod1 NaN 5.610823 0.017760 22208.0 388
ttf1.4 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/ttf1.4.png" width="200" > adaxial cells(6somite) ttf1.4 NaN 5.342396 0.017038 21895.0 229
myf6 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/myf6.png" width="200" > adaxial cells(6somite) myf6 NaN 5.342396 0.017038 21895.0 229
meis3 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/meis3.png" width="200" > adaxial cells(6somite) meis3 NaN 4.601930 0.015049 16295.0 157
msc <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/msc.png" width="200" > adaxial cells(6somite) msc NaN 4.549763 0.014909 22000.0 242
tcf21 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/tcf21.png" width="200" > adaxial cells(6somite) tcf21 NaN 4.400689 0.014508 22102.0 245
mespab <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespab.png" width="200" > adaxial cells(6somite) mespab NaN 4.113608 0.013737 21949.0 201
mespba <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespba.png" width="200" > adaxial cells(6somite) mespba NaN 4.113608 0.013737 21949.0 201
mespbb <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespbb.png" width="200" > adaxial cells(6somite) mespbb NaN 4.113608 0.013737 21949.0 201
mespaa <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mespaa.png" width="200" > adaxial cells(6somite) mespaa NaN 4.113608 0.013737 21949.0 201
TCF4 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/TCF4.png" width="200" > adaxial cells(6somite) TCF4 NaN 3.668444 0.012540 21020.0 194
scrt1a <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt1a.png" width="200" > adaxial cells(6somite) scrt1a NaN 3.597780 0.012350 10775.0 79
scrt2 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt2.png" width="200" > adaxial cells(6somite) scrt2 NaN 3.597780 0.012350 10775.0 79
scrt1b <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/scrt1b.png" width="200" > adaxial cells(6somite) scrt1b NaN 3.597780 0.012350 10775.0 79
zic3 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/zic3.png" width="200" > adaxial cells(6somite) zic3 NaN 3.592721 0.012337 1264.0 18
mef2d <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2d.png" width="200" > adaxial cells(6somite) mef2d NaN 3.287777 0.011517 829.0 13
mef2ab <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2ab.png" width="200" > adaxial cells(6somite) mef2ab NaN 3.270388 0.011471 829.0 13
mef2aa <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/mef2aa.png" width="200" > adaxial cells(6somite) mef2aa NaN 3.270388 0.011471 829.0 13
gli2b <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/gli2b.png" width="200" > adaxial cells(6somite) gli2b NaN 3.221540 0.011340 17827.0 139
gli1 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/gli1.png" width="200" > adaxial cells(6somite) gli1 NaN 3.221540 0.011340 17827.0 139
neurod1 <img src="https://motifcollections.aertslab.org/danio_code_v1/logos/neurod1.png" width="200" > adaxial cells(6somite) neurod1 NaN 3.014450 0.010783 22182.0 170
In [66]:
menr['DEM_DARs_All'].DEM_results('adaxial cells(6somite)')
Out[66]:
Logo Contrast Direct_annot Orthology_annot Log2FC Adjusted_pval Mean_fg Mean_bg Motif_hit_thr Motif_hits
tcf21 adaxial cells(6somite) tcf21 NaN 2.118818 0.0 5.305873 1.2216 3.0 682.0
mbd2 adaxial cells(6somite) mbd2 NaN 2.017327 0.000001 1.348093 0.333 3.0 239.0
patz1 adaxial cells(6somite) patz1 NaN 1.667604 0.000001 1.955043 0.6154 3.0 418.0
hic1 adaxial cells(6somite) hic1 NaN 1.576642 0.000002 1.415373 0.47452 3.0 333.0
zbtb14 adaxial cells(6somite) zbtb14 NaN 1.316687 0.000431 1.248556 0.50124 3.0 294.0
tcf12 adaxial cells(6somite) tcf12 NaN 1.175431 0.0 10.340021 4.57806 3.0 973.0
TCF4 adaxial cells(6somite) TCF4 NaN 1.055659 0.0 5.953263 2.86398 3.0 495.0
hey1 adaxial cells(6somite) hey1 NaN 1.038834 0.0 2.843275 1.38388 3.0 534.0
heyl adaxial cells(6somite) heyl NaN 1.038834 0.0 2.843275 1.38388 3.0 534.0
tcf3a adaxial cells(6somite) tcf3a NaN 0.912908 0.0 10.153653 5.392742 3.0 1023.0
tcf3b adaxial cells(6somite) tcf3b NaN 0.912908 0.0 10.153653 5.392742 3.0 1023.0
tfap4 adaxial cells(6somite) tfap4 NaN 0.775537 0.0 8.948742 5.227597 3.0 998.0
AL807792.3 adaxial cells(6somite) AL807792.3 NaN 0.735716 0.0 8.7317 5.24356 3.0 998.0
atoh1a adaxial cells(6somite) atoh1a NaN 0.735716 0.0 8.7317 5.24356 3.0 998.0
atoh1b adaxial cells(6somite) atoh1b NaN 0.735716 0.0 8.7317 5.24356 3.0 998.0
atoh1c adaxial cells(6somite) atoh1c NaN 0.735716 0.0 8.7317 5.24356 3.0 998.0
bloc1s2 adaxial cells(6somite) bloc1s2 NaN 0.735716 0.0 8.7317 5.24356 3.0 998.0
klf15 adaxial cells(6somite) klf15 NaN 0.644429 0.0 3.397787 2.17372 3.0 525.0
si:ch211-117k10.3 adaxial cells(6somite) si:ch211-117k10.3 NaN 0.644429 0.0 3.397787 2.17372 3.0 525.0
msc adaxial cells(6somite) msc NaN 0.642546 0.0 9.303607 5.959719 3.0 1014.0
myog adaxial cells(6somite) myog NaN 0.636861 0.0 9.977282 6.416499 3.0 1043.0
znf148 adaxial cells(6somite) znf148 NaN 0.610756 0.000001 3.114345 2.03944 3.0 501.0
zbtb18 adaxial cells(6somite) zbtb18 NaN 0.594757 0.0 5.468046 3.620698 3.0 775.0
zbtb42 adaxial cells(6somite) zbtb42 NaN 0.594757 0.0 5.468046 3.620698 3.0 775.0
srebf1 adaxial cells(6somite) srebf1 NaN 0.583174 0.0 3.290217 2.1962 3.0 550.0
rreb1a adaxial cells(6somite) rreb1a NaN 0.538285 0.0 6.766159 4.659101 3.0 780.0
rreb1b adaxial cells(6somite) rreb1b NaN 0.538285 0.0 6.766159 4.659101 3.0 780.0
znf598 adaxial cells(6somite) znf598 NaN 0.538285 0.0 6.766159 4.659101 3.0 780.0
CABZ01090890.1 adaxial cells(6somite) CABZ01090890.1 NaN 0.53278 0.000011 2.525326 1.745559 3.0 429.0
atf6 adaxial cells(6somite) atf6 NaN 0.53278 0.000011 2.525326 1.745559 3.0 429.0
runx3 adaxial cells(6somite) runx3 NaN 0.522043 0.001215 2.306195 1.606 3.0 419.0
myf6 adaxial cells(6somite) myf6 NaN 0.520044 0.0 9.410133 6.56216 3.0 1039.0
ttf1.4 adaxial cells(6somite) ttf1.4 NaN 0.520044 0.0 9.410133 6.56216 3.0 1039.0

6.inferring enhancer-driven Gene Regulatory Networks (eGRNs) using SCENIC+¶

tutorial comes from: https://scenicplus.readthedocs.io/en/latest/Scenicplus_step_by_step-RTD.html#

In [7]:
# load the data for SCENIC+
adata = sc.read_h5ad(os.path.join(projDir, 'data/rna_data/highTOsomite6.h5ad'))
cistopic_obj = dill.load(open(os.path.join(projDir, 'output/zf_highToSomite_cisTopicObject_withTopics.pkl'), 'rb'))
menr = dill.load(open(os.path.join(projDir, 'output/motifs/menr.pkl'), 'rb'))
In [8]:
# check cell name in scATAC-seq data 
cistopic_obj_attri = (vars(cistopic_obj))
print(cistopic_obj_attri.keys())
cistopic_obj_attri['cell_data'].head(3)
dict_keys(['fragment_matrix', 'binary_matrix', 'cell_names', 'region_names', 'cell_data', 'region_data', 'project', 'path_to_fragments', 'selected_model', 'projections'])
Out[8]:
cisTopic_nr_frag cisTopic_log_nr_frag cisTopic_nr_acc cisTopic_log_nr_acc sample_id n_genes total_counts pct_conuts_mt celltype
high_CGTGCTGCATGCTTAG-1___cisTopic 41625 4.619354 34712 4.54048 cisTopic 921 1446 2.297297 high cells
high_GGGCAATAGTTAACCA-1___cisTopic 32104 4.506559 27277 4.435797 cisTopic 1566 2754 30.030488 high cells
high_GTACAATGTAGGATCC-1___cisTopic 32947 4.517816 28057 4.448041 cisTopic 1247 2085 18.427230 high cells
In [9]:
# check the cell name in scRNA-seq data 
adata.obs.head(3)
Out[9]:
orig.ident nCount_RNA nFeature_RNA percent.mt nCount_SCT nFeature_SCT orig.ident2 cell.type
high_CGTGCTGCATGCTTAG-1 high 1446.0 921 2.297297 2921.0 1007 1-high high cells
high_GGGCAATAGTTAACCA-1 high 2754.0 1566 30.030488 3259.0 1566 1-high high cells
high_GTACAATGTAGGATCC-1 high 2085.0 1247 18.427230 3361.0 1247 1-high high cells
In [10]:
# create the SCENIC+ object 
scplus_obj = create_SCENICPLUS_object(
    GEX_anndata = adata.raw.to_adata(),
    cisTopic_obj = cistopic_obj,
    menr = menr, 
    bc_transform_func = lambda x: f'{x}___cisTopic' 
)
scplus_obj.X_EXP = np.array(scplus_obj.X_EXP.todense())
scplus_obj
2022-11-07 21:41:11,789 cisTopic     INFO     Imputing drop-outs
2022-11-07 21:41:50,687 cisTopic     INFO     Scaling
2022-11-07 21:42:33,348 cisTopic     INFO     Keep non zero rows
2022-11-07 21:43:24,993 cisTopic     INFO     Imputed accessibility sparsity: 0.48989547717527104
2022-11-07 21:43:24,994 cisTopic     INFO     Create CistopicImputedFeatures object
2022-11-07 21:43:24,995 cisTopic     INFO     Done!
Out[10]:
SCENIC+ object with n_cells x n_genes = 40992 x 27406 and n_cells x n_regions = 40992 x 444253
	metadata_regions:'Chromosome', 'Start', 'End', 'Width', 'cisTopic_nr_frag', 'cisTopic_log_nr_frag', 'cisTopic_nr_acc', 'cisTopic_log_nr_acc'
	metadata_genes:'_index'
	metadata_cell:'GEX_orig.ident', 'GEX_nCount_RNA', 'GEX_nFeature_RNA', 'GEX_percent.mt', 'GEX_nCount_SCT', 'GEX_nFeature_SCT', 'GEX_orig.ident2', 'GEX_cell.type', 'ACC_cisTopic_nr_frag', 'ACC_cisTopic_log_nr_frag', 'ACC_cisTopic_nr_acc', 'ACC_cisTopic_log_nr_acc', 'ACC_sample_id', 'ACC_n_genes', 'ACC_total_counts', 'ACC_pct_conuts_mt', 'ACC_celltype'
	menr:'CTX_topics_otsu_All', 'CTX_topics_otsu_No_promoters', 'DEM_topics_otsu_All', 'DEM_topics_otsu_No_promoters', 'CTX_topics_top_3_All', 'CTX_topics_top_3_No_promoters', 'DEM_topics_top_3_All', 'DEM_topics_top_3_No_promoters', 'CTX_DARs_All', 'CTX_DARs_No_promoters', 'DEM_DARs_All', 'DEM_DARs_No_promoters'
	dr_cell:'GEX_X_umap'
In [ ]:
# check scplus_obj gene name 
scplus_obj.gene_names 
In [19]:
scplus_obj.metadata_genes = adata.var.copy(deep=True)
In [20]:
scplus_obj.gene_names 
Out[20]:
Index(['ptpn12', 'phtf2', 'phtf2.1', 'CU856344.1', 'si:zfos-932h1.3', 'mansc1',
       'lrp6', 'dusp16', 'crebl2', 'gpr19',
       ...
       'ENSDARG00000086262', 'ENSDARG00000077638', 'ENSDARG00000086236',
       'ENSDARG00000090041', 'ENSDARG00000087469', 'ENSDARG00000059507',
       'ENSDARG00000012496', 'ENSDARG00000098374', 'ENSDARG00000116593',
       'ENSDARG00000088842'],
      dtype='object', length=27406)
In [ ]:
# save scplus_obj
with open(outDir + 'zf_highToSomite_scplusObject.pkl', 'wb') as f:
    pickle.dump(scplus_obj, f)
In [7]:
# only use variable genes
variable_genes =  pd.read_csv(projDir+'data/rna_data/variable_genes.txt', sep='\t')
variable_genes_list = variable_genes['gene name'].astype(str).tolist()
scplus_obj.subset(genes=variable_genes_list)
In [11]:
# save scplus_obj
with open(outDir + 'zf_highToSomite_scplusObject_withFilter.pkl', 'wb') as f:
    pickle.dump(scplus_obj, f)
In [12]:
# generate cistromes
start_time = time.time()
merge_cistromes(scplus_obj)
time = time.time()-start_time
print(time/60)
0.6433848579724629
In [13]:
# infer enhancer to gene relationships
get_search_space(scplus_obj,
                 biomart_host = 'http://apr2020.archive.ensembl.org/',
                 species = 'drerio',
                 assembly = 'danRer11',
                 upstream = [1000, 150000],
                 downstream = [1000, 150000]) 
2022-11-08 11:24:36,499 R2G          INFO     Downloading gene annotation from biomart dataset: drerio_gene_ensembl
2022-11-08 11:25:01,449 R2G          INFO     Downloading chromosome sizes from: http://hgdownload.cse.ucsc.edu/goldenPath/danRer11/bigZips/danRer11.chrom.sizes
2022-11-08 11:25:04,461 R2G          INFO     Extending promoter annotation to 10 bp upstream and 10 downstream
2022-11-08 11:25:16,067 R2G          INFO     Extending search space to:
            						150000 bp downstream of the end of the gene.
            						150000 bp upstream of the start of the gene.
2022-11-08 11:25:57,552 R2G          INFO     Intersecting with regions.
join: Strand data from other will be added as strand data to self.
If this is undesired use the flag apply_strand_suffix=False.
To turn off the warning set apply_strand_suffix to True or False.
2022-11-08 11:25:59,162 R2G          INFO     Calculating distances from region to gene
2022-11-08 11:29:46,606 R2G          INFO     Imploding multiple entries per region and gene
2022-11-08 11:37:13,585 R2G          INFO     Done!
In [14]:
# infer enhancer to gene relationships
calculate_regions_to_genes_relationships(scplus_obj,
                    ray_n_cpu = 20,
                    _temp_dir = tmpDir,
                    importance_scoring_method = 'GBM',
                    importance_scoring_kwargs = GBM_KWARGS) 
2022-11-08 11:43:03,412 R2G          INFO     Calculating region to gene importances, using GBM method
2022-11-08 11:43:28,399	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
initializing:  78%|███████▊  | 4549/5812 [12:59<10:32,  2.00it/s](raylet) Spilled 2235 MiB, 195 objects, write throughput 135 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
initializing:  79%|███████▉  | 4580/5812 [13:04<03:16,  6.25it/s](raylet) Spilled 4186 MiB, 369 objects, write throughput 203 MiB/s.
initializing:  82%|████████▏ | 4745/5812 [13:36<03:01,  5.88it/s](raylet) Spilled 29764 MiB, 2669 objects, write throughput 1034 MiB/s.
initializing:  82%|████████▏ | 4756/5812 [13:38<02:47,  6.31it/s](raylet) Spilled 30085 MiB, 2699 objects, write throughput 1026 MiB/s.
initializing:  82%|████████▏ | 4763/5812 [13:39<03:00,  5.80it/s](raylet) Spilled 75151 MiB, 6666 objects, write throughput 2469 MiB/s.
initializing:  82%|████████▏ | 4769/5812 [13:40<02:47,  6.24it/s](raylet) Spilled 75309 MiB, 6682 objects, write throughput 2446 MiB/s.
initializing: 100%|██████████| 5812/5812 [18:05<00:00,  5.36it/s]
Running using 20 cores: 100%|██████████| 5812/5812 [33:03<00:00,  2.93it/s] 
2022-11-08 12:34:43,328 R2G          INFO     Took 3099.9146208763123 seconds
2022-11-08 12:34:43,329 R2G          INFO     Calculating region to gene correlation, using SR method
2022-11-08 12:35:07,131	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
initializing: 100%|██████████| 5812/5812 [14:39<00:00,  6.61it/s]
Running using 20 cores: 100%|██████████| 5812/5812 [00:04<00:00, 1172.40it/s]
2022-11-08 12:49:57,251 R2G          INFO     Took 913.9199676513672 seconds
2022-11-08 12:50:02,485 R2G          INFO     Done!
In [15]:
# save
with open(outDir + 'zf_highToSomite_scplusObject_withEnhancerToGene.pkl', 'wb') as f:
    pickle.dump(scplus_obj, f)
In [8]:
infile = open(outDir + 'zf_highToSomite_scplusObject_withEnhancerToGene.pkl', 'rb')
scplus_obj = pickle.load(infile)
infile.close()
In [9]:
## infer TF to gene relationships
tf_file = os.path.join(projDir, 'data/motif_data/tf.txt')
calculate_TFs_to_genes_relationships(scplus_obj,
                    tf_file = tf_file,
                    ray_n_cpu = 20, 
                    method = 'GBM',
                    _temp_dir = tmpDir,
                    key= 'TF2G_adj')
2022-11-08 15:10:30,727	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
2022-11-08 15:10:32,710 TF2G         INFO     Calculating TF to gene correlation, using GBM method
initializing:   1%|▏         | 86/6054 [00:22<24:19,  4.09it/s]  (raylet) Spilled 4011 MiB, 8 objects, write throughput 780 MiB/s. Set RAY_verbose_spill_logs=0 to disable this message.
(raylet) Spilled 7020 MiB, 14 objects, write throughput 1140 MiB/s.
(raylet) Spilled 10920 MiB, 20 objects, write throughput 1476 MiB/s.
(raylet) Spilled 17997 MiB, 35 objects, write throughput 1704 MiB/s.
initializing:   2%|▏         | 103/6054 [01:02<51:58,  1.91it/s]  (raylet) Spilled 34157 MiB, 69 objects, write throughput 1703 MiB/s.
initializing:   2%|▏         | 113/6054 [01:18<5:16:22,  3.20s/it](raylet) Spilled 99294 MiB, 198 objects, write throughput 2794 MiB/s.
initializing:   2%|▏         | 146/6054 [01:33<1:17:59,  1.26it/s](raylet) Spilled 132392 MiB, 264 objects, write throughput 2721 MiB/s.
initializing:   5%|▍         | 300/6054 [02:21<23:18,  4.11it/s]  (raylet) Spilled 264784 MiB, 528 objects, write throughput 2954 MiB/s.
initializing:  10%|▉         | 576/6054 [03:44<17:29,  5.22it/s]  (raylet) Spilled 527619 MiB, 1053 objects, write throughput 3042 MiB/s.
initializing:  18%|█▊        | 1091/6054 [06:39<37:48,  2.19it/s]  (raylet) Spilled 1051113 MiB, 2096 objects, write throughput 3021 MiB/s.
initializing:  35%|███▍      | 2107/6054 [12:24<27:33,  2.39it/s]  (raylet) Spilled 2097212 MiB, 4182 objects, write throughput 3034 MiB/s.
initializing:  69%|██████▉   | 4203/6054 [29:51<08:28,  3.64it/s]  (raylet) Spilled 4197433 MiB, 8370 objects, write throughput 2646 MiB/s.
initializing: 100%|██████████| 6054/6054 [42:45<00:00,  2.36it/s]  
Running using 20 cores: 100%|██████████| 6054/6054 [55:02<00:00,  1.83it/s]  
2022-11-08 16:48:25,618 TF2G         INFO     Took 5872.906021118164 seconds
2022-11-08 16:48:25,620 TF2G         INFO     Adding correlation coefficients to adjacencies.
2022-11-08 16:48:37,842 TF2G         INFO     Warning: adding TFs as their own target to adjecencies matrix. Importance values will be max + 1e-05
2022-11-08 16:48:42,057 TF2G         INFO     Adding importance x rho scores to adjacencies.
2022-11-08 16:48:42,076 TF2G         INFO     Took 16.456542253494263 seconds
In [10]:
# save
with open(outDir + 'zf_highToSomite_scplusObject_withTfToGene.pkl', 'wb') as f:
    pickle.dump(scplus_obj, f)
In [18]:
infile = open(outDir + 'zf_highToSomite_scplusObject_withTfToGene.pkl', 'rb')
scplus_obj = pickle.load(infile)
infile.close()
In [19]:
# build eGRNs (it takes 10min)
build_grn(scplus_obj,
         min_target_genes = 10,
         adj_pval_thr = 1,
         min_regions_per_gene = 0,
         quantiles = (0.85, 0.90, 0.95),
         top_n_regionTogenes_per_gene = (5, 10, 15),
         top_n_regionTogenes_per_region = (),
         binarize_using_basc = True,
         rho_dichotomize_tf2g = True,
         rho_dichotomize_r2g = True,
         rho_dichotomize_eregulon = True,
         rho_threshold = 0.05,
         keep_extended_motif_annot = True, 
         merge_eRegulons = True,
         order_regions_to_genes_by = 'importance',
         order_TFs_to_genes_by = 'importance',
         key_added = 'eRegulons_importance',
         cistromes_key = 'Unfiltered',
         disable_tqdm = True,
         ray_n_cpu = 20,
         _temp_dir = tmpDir) 
2022-11-08 17:15:29,801 GSEA         INFO     Thresholding region to gene relationships
2022-11-08 17:15:50,692	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
2022-11-08 17:23:24,614 GSEA         INFO     Subsetting TF2G adjacencies for TF with motif.
2022-11-08 17:23:40,973	INFO worker.py:1509 -- Started a local Ray instance. View the dashboard at http://127.0.0.1:8265 
2022-11-08 17:23:42,899 GSEA         INFO     Running GSEA...
(_ray_run_gsea_for_e_module pid=226126) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226126)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226126) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226126)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226130) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226130)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226130) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226130)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226131) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226131)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226131) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226131)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226113) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226113)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226113) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226113)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226124) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226124)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226124) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226124)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226125) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:71: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226125)   norm_tag =  1.0/sum_correl_tag
(_ray_run_gsea_for_e_module pid=226125) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226125)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
(_ray_run_gsea_for_e_module pid=226128) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:72: RuntimeWarning: divide by zero encountered in divide
(_ray_run_gsea_for_e_module pid=226128)   norm_no_tag = 1.0/Nmiss
(_ray_run_gsea_for_e_module pid=226128) /scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/gseapy/algorithm.py:74: RuntimeWarning: invalid value encountered in multiply
(_ray_run_gsea_for_e_module pid=226128)   RES = np.cumsum(tag_indicator * correl_vector * norm_tag - no_tag_indicator * norm_no_tag, axis=axis)
2022-11-08 17:25:36,582 GSEA         INFO     Subsetting on adjusted pvalue: 1, minimal NES: 0 and minimal leading edge genes 10
2022-11-08 17:25:37,671 GSEA         INFO     Merging eRegulons
2022-11-08 17:25:38,053 GSEA         INFO     Storing eRegulons in .uns[eRegulons_importance].
In [20]:
# save 
with open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'wb') as f:
  dill.dump(scplus_obj, f)

7. Exploring SCENIC+ results¶

use this tutorial: https://scenicplus.readthedocs.io/en/latest/Scenicplus_step_by_step-RTD.html#

A. Generate eRegulon metadata¶
In [8]:
infile = open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [9]:
format_egrns(scplus_obj, eregulons_key = 'eRegulons_importance', TF2G_key = 'TF2G_adj', key_added = 'eRegulon_metadata')
In [10]:
scplus_obj.uns['eRegulon_metadata'][0:10]
Out[10]:
Region_signature_name Gene_signature_name TF is_extended Region Gene R2G_importance R2G_rho R2G_importance_x_rho R2G_importance_x_abs_rho TF2G_importance TF2G_regulation TF2G_rho TF2G_importance_x_abs_rho TF2G_importance_x_rho
0 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr14:14644555-14645055 znf185 0.061208 0.295798 0.018105 0.018105 13.435855 1 0.425698 5.719616 5.719616
1 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr14:14520133-14520633 znf185 0.017040 0.249921 0.004259 0.004259 13.435855 1 0.425698 5.719616 5.719616
2 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr14:14568116-14568616 znf185 0.042576 0.286562 0.012201 0.012201 13.435855 1 0.425698 5.719616 5.719616
3 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr14:14642999-14643499 znf185 0.086022 0.311011 0.026754 0.026754 13.435855 1 0.425698 5.719616 5.719616
4 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr14:14642242-14642742 znf185 0.063553 0.396462 0.025196 0.025196 13.435855 1 0.425698 5.719616 5.719616
5 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr5:30897269-30897769 spns3 0.042442 0.449666 0.019085 0.019085 15.414667 1 0.388184 5.983722 5.983722
6 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr5:31034714-31035214 spns3 0.020183 0.196341 0.003963 0.003963 15.414667 1 0.388184 5.983722 5.983722
7 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr5:30992062-30992562 spns3 0.022284 0.281541 0.006274 0.006274 15.414667 1 0.388184 5.983722 5.983722
8 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr5:31092057-31092557 spns3 0.034984 0.339177 0.011866 0.011866 15.414667 1 0.388184 5.983722 5.983722
9 TCF4_+_+_(2024r) TCF4_+_+_(734g) TCF4 False chr5:30965755-30966255 spns3 0.071577 0.304456 0.021792 0.021792 15.414667 1 0.388184 5.983722 5.983722
In [137]:
scplus_obj.uns['eRegulon_metadata'].to_csv(outDir + 'scenicplus/eRegulon_metadata.csv',sep='\t',index=False)  
In [7]:
infile = open(outDir + 'zf_highToSomite_scplusObject_withGRN.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [44]:
# output Cistromes

df = pd.DataFrame(columns = ['regulon','Chromosome','Start','End'])

for key in scplus_obj.uns['Cistromes']['Unfiltered']:
    df2 = scplus_obj.uns['Cistromes']['Unfiltered'][key]
    df2 = df2.df # change PyRanges as DataFrame
    df3 = pd.DataFrame({'regulon': np.repeat(key,df2.shape[0])}) # generate one data frame  
    df4 = pd.concat([df3, df2], axis=1) # combine two data frames
    
    df = df.append(df4)

df.to_csv("output/scenicplus/cistromes.txt",sep ='\t',index=False,header=True)
In [154]:
for key in scplus_obj.uns['Cistromes']['Unfiltered']:
    print (key)
AL807792.3_(13225r)
arnt_(118r)
arnt2_(6793r)
arntl1a_(623r)
arntl1b_(623r)
atf1_(9657r)
atf2_(5740r)
atf3_(2982r)
atf4a_(12592r)
atf4b_(12592r)
atf6_(20590r)
atf7a_(4588r)
atf7b_(4588r)
atoh1a_(13225r)
atoh1b_(13225r)
atoh1c_(13225r)
barhl1a_(66r)
barhl1b_(66r)
batf2_(2979r)
batf3_(2979r)
bhlha15_(23643r)
bhlhe22_(19794r)
bhlhe40_(92r)
bhlhe41_(2456r)
bloc1s2_(13225r)
bsx_(32r)
BX548005.1_(3475r)
CABZ01066694.1_(1487r)
CABZ01079241.1_(2325r)
CABZ01090890.1_(20590r)
cdx1a_(7r)
cdx1b_(7r)
cebp1_(227r)
cebpb_(4621r)
cebpd_(4507r)
clocka_(40r)
clockb_(40r)
CR354582.1_(405r)
creb1a_(14230r)
creb1b_(14230r)
creb3l1_(73r)
creb3l2_(228r)
creb5a_(17216r)
creb5b_(17216r)
crema_(14230r)
cremb_(14230r)
crx_(614r)
CT027745.1_(18540r)
CT573494.3_(4802r)
ctcf_(15916r)
cxxc1a_(473r)
cxxc1b_(473r)
ddit3_(279r)
dharma_(3830r)
dlx5a_(21r)
dmbx1a_(529r)
dmbx1b_(529r)
drgx_(496r)
e2f1_(4744r)
e2f2_(20585r)
e2f3_(1042r)
e2f4_(57592r)
e2f5_(8878r)
e2f6_(2138r)
e2f7_(72025r)
e2f8_(5r)
e4f1_(7335r)
ebf3a-1_(12763r)
ebf3a-2_(1227r)
ebf3b_(1227r)
egr1_(738r)
egr2a_(7766r)
egr2b_(7766r)
egr3_(1400r)
ehf_(218r)
elf1_(3573r)
elf2a_(2069r)
elf2b_(2069r)
elf3_(183r)
elk1_(3240r)
elk3_(888r)
elk4_(3189r)
emx1_(330r)
en1b_(193r)
en2a_(1770r)
en2b_(1770r)
eomesa_(1710r)
eomesb_(1710r)
erf_(1844r)
erfl3_(1844r)
erg_(1158r)
esrrb_(14806r)
ets2_(1035r)
etv1_(3141r)
etv2_(1086r)
etv4_(103r)
etv5a_(4254r)
etv5b_(4254r)
etv7_(1043r)
evx2_(3429r)
fev_(27688r)
figla_(1127r)
fli1a_(27688r)
fli1b_(256r)
fosaa_(4630r)
fosab_(4630r)
fosl1a_(4630r)
fosl1b_(7484r)
foxa_(1993r)
foxa1_(2248r)
foxa2_(1212r)
foxd1_(189r)
foxd3_(884r)
foxd5_(8r)
foxd7_(189r)
foxg1a_(10915r)
foxg1b_(645r)
foxg1c_(10915r)
foxg1d_(10915r)
foxn1_(31761r)
foxo1a_(875r)
foxo1b_(875r)
foxo3b_(46855r)
foxo6a_(46855r)
foxo6b_(46855r)
foxp2_(613r)
foxq2_(77r)
gabpa_(41829r)
gata1a_(365r)
gata1b_(24739r)
gata2a_(24739r)
gata2b_(365r)
gata3_(131r)
gata4_(5577r)
gata5_(448r)
gata6_(2450r)
gatad2ab_(2498r)
gbx1_(270r)
gcm2_(37r)
gli1_(787r)
gli2a_(177r)
gli2b_(787r)
gli3_(177r)
glis1a_(1763r)
glis1b_(1763r)
glis2a_(1961r)
glis2b_(1961r)
glis3_(3425r)
gmeb1_(15r)
gmeb2_(22r)
grhl1_(32767r)
gsc_(3471r)
gsx1_(17r)
her1_(59r)
her11_(59r)
her12_(51r)
her15.1_(51r)
her15.2_(51r)
her2_(51r)
her4.1_(51r)
her4.2_(51r)
her4.3_(51r)
her4.4_(51r)
her5_(59r)
her6_(11807r)
her8.2_(11807r)
her9_(11807r)
hey1_(90794r)
hey2_(54r)
heyl_(90794r)
hic1_(71114r)
hinfp_(59436r)
hmx4_(140r)
hnf1a_(50r)
hnf1ba_(600r)
hnf1bb_(600r)
hnf4a_(4278r)
hnf4b_(4278r)
hnf4g_(1529r)
homeza_(8r)
homezb_(8r)
hoxa11a_(309r)
hoxa13a_(72r)
hoxa13b_(72r)
hoxa1a_(367r)
hoxa2b_(840r)
hoxa4a_(230r)
hoxa9a_(1829r)
hoxa9b_(1829r)
hoxb1a_(849r)
hoxb1b-1_(367r)
hoxb1b-2_(849r)
hoxb3a_(68r)
hoxb4a_(1982r)
hoxb6a_(1201r)
hoxb6b_(1201r)
hoxb8a_(781r)
hoxb8b_(781r)
hoxc12a_(3704r)
hoxc12b_(3704r)
hoxc13a_(669r)
hoxc1a_(367r)
hoxc3a_(230r)
hoxc9a_(1783r)
hoxd12a_(2821r)
hsf1_(341r)
hsf4_(341r)
id4_(1024r)
ikzf1_(76r)
insm1a_(2025r)
insm1b_(2025r)
irf10_(25526r)
irf3_(25526r)
irf5_(856r)
irf8_(25526r)
irf9_(25526r)
jdp2a_(7484r)
jdp2b_(4823r)
jun_(2498r)
junba_(3488r)
junbb_(3488r)
june_(34394r)
klf12a_(51519r)
klf13_(2745r)
klf15_(46788r)
klf3_(717r)
klf4_(13371r)
klf5l_(7281r)
klf6a_(7281r)
klf6b_(7281r)
klf7a_(449r)
klf7b_(449r)
klf8_(717r)
lef1_(33r)
lhx6_(32r)
lhx6-1_(90r)
lhx8a_(7620r)
lhx8b_(7620r)
mafa_(2327r)
mafb_(239r)
mafbb_(2327r)
maff_(58r)
mafga_(58r)
mafgb_(58r)
max_(2390r)
maza_(54030r)
mazb_(54030r)
mbd1a_(473r)
mbd1b_(473r)
mbd2_(48347r)
mecom_(123r)
mecp2_(23406r)
mef2aa_(13r)
mef2ab_(13r)
mef2b_(44011r)
mef2d_(13r)
meis3_(1224r)
meox1_(154r)
meox2a_(30r)
mespaa_(842r)
mespab_(842r)
mespba_(842r)
mespbb_(842r)
mgaa_(4160r)
mlx_(56123r)
mlxipl_(392r)
mnta_(46r)
mntb_(46r)
msc_(6406r)
msx1a_(32r)
msx1b_(16r)
msx2b_(55r)
msx3_(16r)
mta3_(7392r)
mtf1_(34911r)
mxi1_(872r)
myb_(128r)
myca_(838r)
mycb_(6095r)
mych_(838r)
mycn_(909r)
myf6_(2771r)
myod1_(2422r)
myog_(7268r)
nanog_(32r)
neurod1_(7955r)
neurod2_(543r)
nfatc2a_(33r)
nfe2l3_(2183r)
nfia_(13r)
nfkb1_(206r)
nfya_(4123r)
nfyal_(4713r)
nkx1.2lb_(965r)
nkx2.1_(18r)
npas2_(40r)
nr1h4_(195r)
nr1i2_(51r)
nr2c1_(3254r)
nr2c2_(15000r)
nr2f1a_(3062r)
nr2f1b_(177r)
nr2f2_(4755r)
nr2f5_(177r)
nr2f6a_(4449r)
nr2f6b_(4449r)
nr4a3_(2516r)
nr6a1b_(9206r)
nrl_(239r)
olig1_(11310r)
onecut1_(11608r)
onecutl_(11608r)
otx1_(6329r)
otx2a_(6842r)
otx2b_(1310r)
otx5_(614r)
patz1_(68153r)
pax1a_(428r)
pax1b_(428r)
pax2a_(395r)
pax2b_(71r)
pax5_(395r)
pax8_(6r)
pax9_(688r)
pbx1a_(34r)
pbx1b_(34r)
pbx2_(34r)
pbx3a_(34r)
pbx3b_(34r)
pdx1_(1502r)
pgm3_(8r)
phox2ba_(39r)
pitx1_(19r)
pitx2_(508r)
pitx3_(19r)
pknox2_(2214r)
plag1_(2115r)
plagx_(2115r)
pou2f2a_(8394r)
pou2f3_(440r)
pou3f1_(753r)
pou4f3_(15r)
pparaa_(2903r)
pparda_(2903r)
ppardb_(2903r)
prdm14_(34r)
prdm2a_(24571r)
prdm4_(569r)
prox1a_(22297r)
prox2_(22297r)
prox3_(22297r)
prrx1a_(695r)
prrx1b_(695r)
raraa_(29r)
rarab_(29r)
rarga_(179r)
rargb_(74r)
rcor1_(1142r)
rest_(3189r)
rfx1a_(1216r)
rfx1b_(292r)
rfx2_(1444r)
rfx3_(6463r)
rfx4_(1444r)
rfx7a_(86r)
rfx7b_(86r)
rlf_(123r)
rreb1a_(13284r)
rreb1b_(13284r)
rsl1d1_(6559r)
runx1_(3r)
runx3_(3226r)
rxraa_(4619r)
rxrab_(4619r)
rxrba_(4619r)
rxrbb_(4114r)
rxrgb_(4619r)
scrt1a_(956r)
scrt1b_(956r)
scrt2_(956r)
si:ch211-117k10.3_(46788r)
si:ch211-153j24.3_(2598r)
si:ch211-51e8.2_(12763r)
si:ch211-69l10.4_(17177r)
si:ch73-59f11.3_(2390r)
si:dkey-149i17.7_(3549r)
si:dkey-229b18.3_(302r)
si:dkey-56e3.3_(26r)
si:dkeyp-79b7.12_(7335r)
si:rp71-45k5.2_(645r)
sinhcafl_(123r)
six1a_(213r)
six2a_(213r)
six2b_(213r)
six3a_(676r)
six3b_(676r)
six4a_(13r)
six4b_(13r)
six5_(213r)
six7_(676r)
six9_(213r)
smad1_(81r)
smad10a_(19r)
smad2_(4r)
smad3a_(9r)
smad3b_(9r)
smad4a_(19r)
smarcc1a_(2r)
smarcc1b_(2r)
smarcc2_(3475r)
snai1a_(2769r)
snai1b_(1435r)
snai2_(302r)
snai3_(1435r)
sox10_(221r)
sox12_(19r)
sox19a_(812r)
sox19b_(812r)
sox2_(795r)
sox3_(812r)
sox6_(258r)
sox8a_(48r)
sox8b_(48r)
sox9a_(2967r)
sox9b_(2967r)
sp1_(46071r)
sp2_(6197r)
sp3a_(6559r)
sp3b_(51613r)
sp4_(7887r)
sp8a_(4847r)
sp8b_(4847r)
SPDEF_(19958r)
spi1a_(56r)
spi1b_(56r)
srebf1_(27928r)
srebf2_(178r)
srfa_(58r)
srfb_(58r)
stat5a_(5314r)
stat5b_(5314r)
stat6_(14034r)
tal1_(2529r)
tbr1a_(6665r)
tbr1b_(6665r)
tbx1_(16675r)
tbx15_(17177r)
tbx16_(4200r)
tbx16l_(4200r)
tbx19_(18540r)
tbx20_(1375r)
tbx21_(6665r)
tbx2a_(4200r)
tbx2b_(4200r)
tbx4_(4112r)
tbx5a_(2319r)
tbx5b_(2319r)
tbxta_(15246r)
tbxtb_(15246r)
tcf12_(14837r)
tcf21_(22386r)
tcf3a_(12259r)
tcf3b_(12259r)
TCF4_(16691r)
tcf7_(568r)
tcf7l1a_(1838r)
tcf7l1b_(1838r)
tcf7l2_(1838r)
tead1a_(4802r)
tead1b_(4422r)
tead3a_(10088r)
tead3b_(10088r)
tefa_(4047r)
tefb_(4047r)
tfap2a_(26430r)
tfap2b_(2871r)
tfap2c_(26430r)
tfap2d_(61543r)
tfap2e_(31698r)
tfap4_(10094r)
tfcp2_(4835r)
tfcp2l1_(4835r)
tfdp1a_(66653r)
tfdp1b_(66653r)
tfeb_(1725r)
tlx1_(2930r)
tp53_(4522r)
tp73_(445r)
ttf1.4_(2771r)
twist1a_(52r)
twist1b_(52r)
twist3_(52r)
ubp1_(4835r)
usf1_(216r)
usf1l_(19340r)
usf2_(2064r)
VDR_(51r)
vdrb_(298r)
ved_(7r)
wt1a_(19807r)
wt1b_(19807r)
xbp1_(21r)
ybx1_(565r)
yy1a_(115r)
yy1b_(115r)
zbtb14_(66514r)
zbtb18_(12835r)
zbtb3_(24r)
zbtb42_(12835r)
zbtb47b_(20798r)
zbtb49_(31r)
zbtb7a_(56r)
zbtb7b_(5948r)
zbtb7c_(56r)
zeb1a_(349r)
zeb1b_(349r)
zfx_(2325r)
zgc:112083_(21136r)
zgc:113424_(10915r)
zgc:153115_(2745r)
zic1_(3003r)
zic2a_(4655r)
zic2b_(4655r)
zic3_(241r)
zic4_(32032r)
zic6_(32032r)
znf143b_(29475r)
znf148_(39387r)
znf423_(5715r)
znf598_(13284r)
znf652_(20798r)
znf711_(2325r)
znf740a_(34273r)
znf740b_(34273r)
AL807792.3_extended_(13225r)
arnt2_extended_(6793r)
arnt_extended_(118r)
arntl1a_extended_(623r)
arntl1b_extended_(623r)
atf1_extended_(9657r)
atf2_extended_(5740r)
atf3_extended_(2982r)
atf4a_extended_(12592r)
atf4b_extended_(12592r)
atf6_extended_(20590r)
atf7a_extended_(4588r)
atf7b_extended_(4588r)
atoh1a_extended_(13225r)
atoh1b_extended_(13225r)
atoh1c_extended_(13225r)
barhl1a_extended_(66r)
barhl1b_extended_(66r)
batf2_extended_(2979r)
batf3_extended_(2979r)
bhlha15_extended_(23643r)
bhlhe22_extended_(19794r)
bhlhe40_extended_(92r)
bhlhe41_extended_(2456r)
bloc1s2_extended_(13225r)
bsx_extended_(32r)
BX548005.1_extended_(3475r)
CABZ01066694.1_extended_(1487r)
CABZ01079241.1_extended_(2325r)
CABZ01090890.1_extended_(20590r)
cdx1a_extended_(7r)
cdx1b_extended_(7r)
cebp1_extended_(227r)
cebpb_extended_(4621r)
cebpd_extended_(4507r)
clocka_extended_(40r)
clockb_extended_(40r)
CR354582.1_extended_(405r)
creb1a_extended_(14230r)
creb1b_extended_(14230r)
creb3l1_extended_(73r)
creb3l2_extended_(228r)
creb5a_extended_(17216r)
creb5b_extended_(17216r)
crema_extended_(14230r)
cremb_extended_(14230r)
crx_extended_(614r)
CT027745.1_extended_(18540r)
CT573494.3_extended_(4802r)
ctcf_extended_(15916r)
cxxc1a_extended_(473r)
cxxc1b_extended_(473r)
ddit3_extended_(279r)
dharma_extended_(3830r)
dlx5a_extended_(21r)
dmbx1a_extended_(529r)
dmbx1b_extended_(529r)
drgx_extended_(496r)
e2f1_extended_(4744r)
e2f2_extended_(20585r)
e2f3_extended_(1042r)
e2f4_extended_(57592r)
e2f5_extended_(8878r)
e2f6_extended_(2138r)
e2f7_extended_(72025r)
e2f8_extended_(5r)
e4f1_extended_(7335r)
ebf3a-1_extended_(12763r)
ebf3a-2_extended_(1227r)
ebf3b_extended_(1227r)
egr1_extended_(738r)
egr2a_extended_(7766r)
egr2b_extended_(7766r)
egr3_extended_(1400r)
ehf_extended_(218r)
elf1_extended_(3573r)
elf2a_extended_(2069r)
elf2b_extended_(2069r)
elf3_extended_(183r)
elk1_extended_(3240r)
elk3_extended_(888r)
elk4_extended_(3189r)
emx1_extended_(330r)
en1b_extended_(193r)
en2a_extended_(1770r)
en2b_extended_(1770r)
eomesa_extended_(1710r)
eomesb_extended_(1710r)
erf_extended_(1844r)
erfl3_extended_(1844r)
erg_extended_(1158r)
esrrb_extended_(14806r)
ets2_extended_(1035r)
etv1_extended_(3141r)
etv2_extended_(1086r)
etv4_extended_(103r)
etv5a_extended_(4254r)
etv5b_extended_(4254r)
etv7_extended_(1043r)
evx2_extended_(3429r)
fev_extended_(27688r)
figla_extended_(1127r)
fli1a_extended_(27688r)
fli1b_extended_(256r)
fosaa_extended_(4630r)
fosab_extended_(4630r)
fosl1a_extended_(4630r)
fosl1b_extended_(7484r)
foxa1_extended_(2248r)
foxa2_extended_(1212r)
foxa_extended_(1993r)
foxd1_extended_(189r)
foxd3_extended_(884r)
foxd5_extended_(8r)
foxd7_extended_(189r)
foxg1a_extended_(10915r)
foxg1b_extended_(645r)
foxg1c_extended_(10915r)
foxg1d_extended_(10915r)
foxn1_extended_(31761r)
foxo1a_extended_(875r)
foxo1b_extended_(875r)
foxo3b_extended_(46855r)
foxo6a_extended_(46855r)
foxo6b_extended_(46855r)
foxp2_extended_(613r)
foxq2_extended_(77r)
gabpa_extended_(41829r)
gata1a_extended_(365r)
gata1b_extended_(24739r)
gata2a_extended_(24739r)
gata2b_extended_(365r)
gata3_extended_(131r)
gata4_extended_(5577r)
gata5_extended_(448r)
gata6_extended_(2450r)
gatad2ab_extended_(2498r)
gbx1_extended_(270r)
gcm2_extended_(37r)
gli1_extended_(787r)
gli2a_extended_(177r)
gli2b_extended_(787r)
gli3_extended_(177r)
glis1a_extended_(1763r)
glis1b_extended_(1763r)
glis2a_extended_(1961r)
glis2b_extended_(1961r)
glis3_extended_(3425r)
gmeb1_extended_(15r)
gmeb2_extended_(22r)
grhl1_extended_(32767r)
gsc_extended_(3471r)
gsx1_extended_(17r)
her11_extended_(59r)
her12_extended_(51r)
her15.1_extended_(51r)
her15.2_extended_(51r)
her1_extended_(59r)
her2_extended_(51r)
her4.1_extended_(51r)
her4.2_extended_(51r)
her4.3_extended_(51r)
her4.4_extended_(51r)
her5_extended_(59r)
her6_extended_(11807r)
her8.2_extended_(11807r)
her9_extended_(11807r)
hey1_extended_(90794r)
hey2_extended_(54r)
heyl_extended_(90794r)
hic1_extended_(71114r)
hinfp_extended_(59436r)
hmx4_extended_(140r)
hnf1a_extended_(50r)
hnf1ba_extended_(600r)
hnf1bb_extended_(600r)
hnf4a_extended_(4278r)
hnf4b_extended_(4278r)
hnf4g_extended_(1529r)
homeza_extended_(8r)
homezb_extended_(8r)
hoxa11a_extended_(309r)
hoxa13a_extended_(72r)
hoxa13b_extended_(72r)
hoxa1a_extended_(367r)
hoxa2b_extended_(840r)
hoxa4a_extended_(230r)
hoxa9a_extended_(1829r)
hoxa9b_extended_(1829r)
hoxb1a_extended_(849r)
hoxb1b-1_extended_(367r)
hoxb1b-2_extended_(849r)
hoxb3a_extended_(68r)
hoxb4a_extended_(1982r)
hoxb6a_extended_(1201r)
hoxb6b_extended_(1201r)
hoxb8a_extended_(781r)
hoxb8b_extended_(781r)
hoxc12a_extended_(3704r)
hoxc12b_extended_(3704r)
hoxc13a_extended_(669r)
hoxc1a_extended_(367r)
hoxc3a_extended_(230r)
hoxc9a_extended_(1783r)
hoxd12a_extended_(2821r)
hsf1_extended_(341r)
hsf4_extended_(341r)
id4_extended_(1024r)
ikzf1_extended_(76r)
insm1a_extended_(2025r)
insm1b_extended_(2025r)
irf10_extended_(25526r)
irf3_extended_(25526r)
irf5_extended_(856r)
irf8_extended_(25526r)
irf9_extended_(25526r)
jdp2a_extended_(7484r)
jdp2b_extended_(4823r)
jun_extended_(2498r)
junba_extended_(3488r)
junbb_extended_(3488r)
june_extended_(34394r)
klf12a_extended_(51519r)
klf13_extended_(2745r)
klf15_extended_(46788r)
klf3_extended_(717r)
klf4_extended_(13371r)
klf5l_extended_(7281r)
klf6a_extended_(7281r)
klf6b_extended_(7281r)
klf7a_extended_(449r)
klf7b_extended_(449r)
klf8_extended_(717r)
lef1_extended_(33r)
lhx6-1_extended_(90r)
lhx6_extended_(32r)
lhx8a_extended_(7620r)
lhx8b_extended_(7620r)
mafa_extended_(2327r)
mafb_extended_(239r)
mafbb_extended_(2327r)
maff_extended_(58r)
mafga_extended_(58r)
mafgb_extended_(58r)
max_extended_(2390r)
maza_extended_(54030r)
mazb_extended_(54030r)
mbd1a_extended_(473r)
mbd1b_extended_(473r)
mbd2_extended_(48347r)
mecom_extended_(123r)
mecp2_extended_(23406r)
mef2aa_extended_(13r)
mef2ab_extended_(13r)
mef2b_extended_(44011r)
mef2d_extended_(13r)
meis3_extended_(1224r)
meox1_extended_(154r)
meox2a_extended_(30r)
mespaa_extended_(842r)
mespab_extended_(842r)
mespba_extended_(842r)
mespbb_extended_(842r)
mgaa_extended_(4160r)
mlx_extended_(56123r)
mlxipl_extended_(392r)
mnta_extended_(46r)
mntb_extended_(46r)
msc_extended_(6406r)
msx1a_extended_(32r)
msx1b_extended_(16r)
msx2b_extended_(55r)
msx3_extended_(16r)
mta3_extended_(7392r)
mtf1_extended_(34911r)
mxi1_extended_(872r)
myb_extended_(128r)
myca_extended_(838r)
mycb_extended_(6095r)
mych_extended_(838r)
mycn_extended_(909r)
myf6_extended_(2771r)
myod1_extended_(2422r)
myog_extended_(7268r)
nanog_extended_(32r)
neurod1_extended_(7955r)
neurod2_extended_(543r)
nfatc2a_extended_(33r)
nfe2l3_extended_(2183r)
nfia_extended_(13r)
nfkb1_extended_(206r)
nfya_extended_(4123r)
nfyal_extended_(4713r)
nkx1.2lb_extended_(965r)
nkx2.1_extended_(18r)
npas2_extended_(40r)
nr1h4_extended_(195r)
nr1i2_extended_(51r)
nr2c1_extended_(3254r)
nr2c2_extended_(15000r)
nr2f1a_extended_(3062r)
nr2f1b_extended_(177r)
nr2f2_extended_(4755r)
nr2f5_extended_(177r)
nr2f6a_extended_(4449r)
nr2f6b_extended_(4449r)
nr4a3_extended_(2516r)
nr6a1b_extended_(9206r)
nrl_extended_(239r)
olig1_extended_(11310r)
onecut1_extended_(11608r)
onecutl_extended_(11608r)
otx1_extended_(6329r)
otx2a_extended_(6842r)
otx2b_extended_(1310r)
otx5_extended_(614r)
patz1_extended_(68153r)
pax1a_extended_(428r)
pax1b_extended_(428r)
pax2a_extended_(395r)
pax2b_extended_(71r)
pax5_extended_(395r)
pax8_extended_(6r)
pax9_extended_(688r)
pbx1a_extended_(34r)
pbx1b_extended_(34r)
pbx2_extended_(34r)
pbx3a_extended_(34r)
pbx3b_extended_(34r)
pdx1_extended_(1502r)
pgm3_extended_(8r)
phox2ba_extended_(39r)
pitx1_extended_(19r)
pitx2_extended_(508r)
pitx3_extended_(19r)
pknox2_extended_(2214r)
plag1_extended_(2115r)
plagx_extended_(2115r)
pou2f2a_extended_(8394r)
pou2f3_extended_(440r)
pou3f1_extended_(753r)
pou4f3_extended_(15r)
pparaa_extended_(2903r)
pparda_extended_(2903r)
ppardb_extended_(2903r)
prdm14_extended_(34r)
prdm2a_extended_(24571r)
prdm4_extended_(569r)
prox1a_extended_(22297r)
prox2_extended_(22297r)
prox3_extended_(22297r)
prrx1a_extended_(695r)
prrx1b_extended_(695r)
raraa_extended_(29r)
rarab_extended_(29r)
rarga_extended_(179r)
rargb_extended_(74r)
rcor1_extended_(1142r)
rest_extended_(3189r)
rfx1a_extended_(1216r)
rfx1b_extended_(292r)
rfx2_extended_(1444r)
rfx3_extended_(6463r)
rfx4_extended_(1444r)
rfx7a_extended_(86r)
rfx7b_extended_(86r)
rlf_extended_(123r)
rreb1a_extended_(13284r)
rreb1b_extended_(13284r)
rsl1d1_extended_(6559r)
runx1_extended_(3r)
runx3_extended_(3226r)
rxraa_extended_(4619r)
rxrab_extended_(4619r)
rxrba_extended_(4619r)
rxrbb_extended_(4114r)
rxrgb_extended_(4619r)
scrt1a_extended_(956r)
scrt1b_extended_(956r)
scrt2_extended_(956r)
si:ch211-117k10.3_extended_(46788r)
si:ch211-153j24.3_extended_(2598r)
si:ch211-51e8.2_extended_(12763r)
si:ch211-69l10.4_extended_(17177r)
si:ch73-59f11.3_extended_(2390r)
si:dkey-149i17.7_extended_(3549r)
si:dkey-229b18.3_extended_(302r)
si:dkey-56e3.3_extended_(26r)
si:dkeyp-79b7.12_extended_(7335r)
si:rp71-45k5.2_extended_(645r)
sinhcafl_extended_(123r)
six1a_extended_(213r)
six2a_extended_(213r)
six2b_extended_(213r)
six3a_extended_(676r)
six3b_extended_(676r)
six4a_extended_(13r)
six4b_extended_(13r)
six5_extended_(213r)
six7_extended_(676r)
six9_extended_(213r)
smad10a_extended_(19r)
smad1_extended_(81r)
smad2_extended_(4r)
smad3a_extended_(9r)
smad3b_extended_(9r)
smad4a_extended_(19r)
smarcc1a_extended_(2r)
smarcc1b_extended_(2r)
smarcc2_extended_(3475r)
snai1a_extended_(2769r)
snai1b_extended_(1435r)
snai2_extended_(302r)
snai3_extended_(1435r)
sox10_extended_(221r)
sox12_extended_(19r)
sox19a_extended_(812r)
sox19b_extended_(812r)
sox2_extended_(795r)
sox3_extended_(812r)
sox6_extended_(258r)
sox8a_extended_(48r)
sox8b_extended_(48r)
sox9a_extended_(2967r)
sox9b_extended_(2967r)
sp1_extended_(46071r)
sp2_extended_(6197r)
sp3a_extended_(6559r)
sp3b_extended_(51613r)
sp4_extended_(7887r)
sp8a_extended_(4847r)
sp8b_extended_(4847r)
SPDEF_extended_(19958r)
spi1a_extended_(56r)
spi1b_extended_(56r)
srebf1_extended_(27928r)
srebf2_extended_(178r)
srfa_extended_(58r)
srfb_extended_(58r)
stat5a_extended_(5314r)
stat5b_extended_(5314r)
stat6_extended_(14034r)
tal1_extended_(2529r)
tbr1a_extended_(6665r)
tbr1b_extended_(6665r)
tbx15_extended_(17177r)
tbx16_extended_(4200r)
tbx16l_extended_(4200r)
tbx19_extended_(18540r)
tbx1_extended_(16675r)
tbx20_extended_(1375r)
tbx21_extended_(6665r)
tbx2a_extended_(4200r)
tbx2b_extended_(4200r)
tbx4_extended_(4112r)
tbx5a_extended_(2319r)
tbx5b_extended_(2319r)
tbxta_extended_(15246r)
tbxtb_extended_(15246r)
tcf12_extended_(14837r)
tcf21_extended_(22386r)
tcf3a_extended_(12259r)
tcf3b_extended_(12259r)
TCF4_extended_(16691r)
tcf7_extended_(568r)
tcf7l1a_extended_(1838r)
tcf7l1b_extended_(1838r)
tcf7l2_extended_(1838r)
tead1a_extended_(4802r)
tead1b_extended_(4422r)
tead3a_extended_(10088r)
tead3b_extended_(10088r)
tefa_extended_(4047r)
tefb_extended_(4047r)
tfap2a_extended_(26430r)
tfap2b_extended_(2871r)
tfap2c_extended_(26430r)
tfap2d_extended_(61543r)
tfap2e_extended_(31698r)
tfap4_extended_(10094r)
tfcp2_extended_(4835r)
tfcp2l1_extended_(4835r)
tfdp1a_extended_(66653r)
tfdp1b_extended_(66653r)
tfeb_extended_(1725r)
tlx1_extended_(2930r)
tp53_extended_(4522r)
tp73_extended_(445r)
ttf1.4_extended_(2771r)
twist1a_extended_(52r)
twist1b_extended_(52r)
twist3_extended_(52r)
ubp1_extended_(4835r)
usf1_extended_(216r)
usf1l_extended_(19340r)
usf2_extended_(2064r)
VDR_extended_(51r)
vdrb_extended_(298r)
ved_extended_(7r)
wt1a_extended_(19807r)
wt1b_extended_(19807r)
xbp1_extended_(21r)
ybx1_extended_(565r)
yy1a_extended_(115r)
yy1b_extended_(115r)
zbtb14_extended_(66514r)
zbtb18_extended_(12835r)
zbtb3_extended_(24r)
zbtb42_extended_(12835r)
zbtb47b_extended_(20798r)
zbtb49_extended_(31r)
zbtb7a_extended_(56r)
zbtb7b_extended_(5948r)
zbtb7c_extended_(56r)
zeb1a_extended_(349r)
zeb1b_extended_(349r)
zfx_extended_(2325r)
zgc:112083_extended_(21136r)
zgc:113424_extended_(10915r)
zgc:153115_extended_(2745r)
zic1_extended_(3003r)
zic2a_extended_(4655r)
zic2b_extended_(4655r)
zic3_extended_(241r)
zic4_extended_(32032r)
zic6_extended_(32032r)
znf143b_extended_(29475r)
znf148_extended_(39387r)
znf423_extended_(5715r)
znf598_extended_(13284r)
znf652_extended_(20798r)
znf711_extended_(2325r)
znf740a_extended_(34273r)
znf740b_extended_(34273r)
In [46]:
# output region_to_gene 
scplus_obj.uns['region_to_gene'].to_csv("output/scenicplus/region_to_gene.txt",sep ='\t',index=False,header=True)
In [47]:
# output TF2G_adj 
scplus_obj.uns['TF2G_adj'].to_csv("output/scenicplus/TF2G_adj.txt",sep ='\t',index=False,header=True)
In [48]:
# output search_space 
scplus_obj.uns['search_space'].to_csv("output/scenicplus/search_space.txt",sep ='\t',index=False,header=True)
B. Assesing eGRN enrichment in cells¶
In [11]:
# format eRegulons
get_eRegulons_as_signatures(scplus_obj, eRegulon_metadata_key='eRegulon_metadata', key_added='eRegulon_signatures')
In [12]:
# score chromatin layer
import time
start_time = time.time()
region_ranking = make_rankings(scplus_obj, target='region')
score_eRegulons(scplus_obj,
                ranking = region_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'region',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 1) # if we use more than 1 cpu, there is no enough memory even we ask 270g
time = time.time()-start_time
print(time/60)
100%|██████████| 486/486 [02:51<00:00,  2.83it/s]
43.07801492214203
In [13]:
# score transcriptome layer
import time
start_time = time.time()
gene_ranking = make_rankings(scplus_obj, target='gene')
score_eRegulons(scplus_obj,
                ranking = gene_ranking,
                eRegulon_signatures_key = 'eRegulon_signatures',
                key_added = 'eRegulon_AUC',
                enrichment_type= 'gene',
                auc_threshold = 0.05,
                normalize = False,
                n_cpu = 1)
time = time.time()-start_time
print(time/60)
100%|██████████| 486/486 [01:29<00:00,  5.42it/s]
2.0606292963027952
In [14]:
with open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withAUC.pkl', 'wb') as f:
  dill.dump(scplus_obj, f)
In [139]:
scplus_obj.uns.keys()
Out[139]:
dict_keys(['Cistromes', 'search_space', 'region_to_gene', 'TF2G_adj', 'eRegulons_importance', 'eRegulon_metadata', 'eRegulon_signatures', 'eRegulon_AUC', 'Pseudobulk', 'TF_cistrome_correlation', 'selected_eRegulons', 'eRegulon_AUC_thresholds', 'RSS'])
In [ ]:
scplus_obj.uns['eRegulon_AUC']
C. Assessing TF-eGRN relationships¶
In [56]:
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [37]:
# generate pseudobulks
import time
start_time = time.time()
generate_pseudobulks(scplus_obj,
                         variable = 'GEX_cell.type',
                         auc_key = 'eRegulon_AUC',
                         signature_key = 'Gene_based',
                         nr_cells = 5,
                         nr_pseudobulks = 100,
                         seed=555)
generate_pseudobulks(scplus_obj,
                         variable = 'GEX_cell.type',
                         auc_key = 'eRegulon_AUC',
                         signature_key = 'Region_based',
                         nr_cells = 5,
                         nr_pseudobulks = 100,
                         seed=555)
time = time.time()-start_time
print(time/60)
0.9698616782824199
In [38]:
# correlation between TF and eRegulons
import time
start_time = time.time()
TF_cistrome_correlation(scplus_obj,
                        variable = 'GEX_cell.type',
                        auc_key = 'eRegulon_AUC',
                        signature_key = 'Gene_based',
                        out_key = 'GEX_cell.type_eGRN_gene_based')
TF_cistrome_correlation(scplus_obj,
                        variable = 'GEX_cell.type',
                        auc_key = 'eRegulon_AUC',
                        signature_key = 'Region_based',
                        out_key = 'GEX_cell.type_eGRN__region_based')
time = time.time()-start_time
print(time/60)
0.019954482714335125
In [124]:
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN_gene_based'].head()
Out[124]:
TF Cistrome Rho P-value Adjusted_p-value
0 TCF4 TCF4_+_+_(734g) 0.643763 0.000000e+00 0.000000e+00
1 TCF4 TCF4_+_-_(276g) 0.637050 0.000000e+00 0.000000e+00
2 TCF4 TCF4_-_+_(14g) -0.375055 4.712733e-315 5.289580e-315
3 TCF4 TCF4_extended_+_+_(734g) 0.643763 0.000000e+00 0.000000e+00
4 TCF4 TCF4_extended_+_-_(276g) 0.637050 0.000000e+00 0.000000e+00

We can plot eRegulon enrichment versus TF expression for each pseudobulk.

In [110]:
# Region based
%matplotlib inline
import seaborn as sns
sns.set_style("white")
colors = ["#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
           "#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
           "#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
           "#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
           "#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700", "#6FB000", "#00B813", "#00BD61", "#00C08E", "#00BDD4",
           "#00A7FF", "#7F96FF", "#E26EF7", "#FF62BF", "#D69100", "#BC81FF","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700","#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700",
         "#E9842C","#F8766D", "#BC9D00", "#00C0B4", "#9CA700"]
categories = sorted(set(scplus_obj.metadata_cell['GEX_cell.type']))
color_dict = dict(zip(categories, colors[0:len(categories)]))
In [111]:
len(colors)
Out[111]:
95
In [51]:
dgem = scplus_obj.uns['Pseudobulk']['GEX_cell.type']['Expression'].copy()
In [60]:
dgem
Out[60]:
non-dorsal margin(dome)_0 non-dorsal margin(dome)_1 non-dorsal margin(dome)_2 non-dorsal margin(dome)_3 non-dorsal margin(dome)_4 non-dorsal margin(dome)_5 non-dorsal margin(dome)_6 non-dorsal margin(dome)_7 non-dorsal margin(dome)_8 non-dorsal margin(dome)_9 ... dorsal posterior(50epi)_90 dorsal posterior(50epi)_91 dorsal posterior(50epi)_92 dorsal posterior(50epi)_93 dorsal posterior(50epi)_94 dorsal posterior(50epi)_95 dorsal posterior(50epi)_96 dorsal posterior(50epi)_97 dorsal posterior(50epi)_98 dorsal posterior(50epi)_99
lrmp 1.144091 2.621850 3.716425 2.382706 0.000000 0.000000 1.194217 3.653447 1.121054 3.740444 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 2.248244
h1m 0.000000 0.000000 1.057777 0.000000 2.428852 0.000000 0.000000 1.148690 0.000000 0.000000 ... 1.180652 0.000000 0.000000 0.000000 1.144152 0.000000 0.000000 0.000000 0.000000 1.159197
buc 0.000000 0.000000 1.123443 0.000000 0.000000 1.260509 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
cldng 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
nanos3 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
si:dkey-37g12.1 1.135956 0.000000 2.235393 0.000000 2.396255 0.000000 0.000000 0.000000 0.983159 1.147330 ... 1.363163 1.351154 1.110953 0.000000 1.144152 0.000000 2.371025 0.000000 1.115729 1.121162
polrmt 0.000000 2.126545 0.000000 2.399753 0.000000 1.260509 0.000000 1.103503 3.620539 1.301960 ... 1.090825 0.000000 0.000000 2.367095 2.276762 2.577747 0.000000 0.000000 0.000000 1.127082
prkcbp1l 1.258668 3.303022 1.396969 1.066149 3.417448 0.000000 1.332591 3.599749 1.208858 1.576552 ... 1.180652 3.851826 4.047869 5.194694 3.288702 1.332825 3.599356 1.226213 2.785311 2.643868
ap2a1 1.282392 2.100446 1.276825 3.490016 1.066149 2.567899 1.307390 3.376952 0.000000 0.000000 ... 2.389834 4.792479 1.110953 0.000000 3.427004 2.427370 3.824031 0.000000 1.115729 2.418685
ap2m1a 0.000000 0.000000 0.000000 0.000000 1.315681 1.281786 1.194217 0.000000 0.000000 0.000000 ... 1.180652 1.212757 2.232278 2.405485 1.132610 1.246034 0.000000 0.000000 0.000000 0.000000

6054 rows × 9500 columns

In [57]:
pseudobulk_variable = 'GEX_cell.type'
signature_key = 'Region_based'
auc_key = 'eRegulon_AUC'
In [58]:
cistromes_auc = scplus_obj.uns['Pseudobulk'][pseudobulk_variable][auc_key][signature_key].copy()
In [59]:
cistromes_auc
Out[59]:
non-dorsal margin(dome)_0 non-dorsal margin(dome)_1 non-dorsal margin(dome)_2 non-dorsal margin(dome)_3 non-dorsal margin(dome)_4 non-dorsal margin(dome)_5 non-dorsal margin(dome)_6 non-dorsal margin(dome)_7 non-dorsal margin(dome)_8 non-dorsal margin(dome)_9 ... dorsal posterior(50epi)_90 dorsal posterior(50epi)_91 dorsal posterior(50epi)_92 dorsal posterior(50epi)_93 dorsal posterior(50epi)_94 dorsal posterior(50epi)_95 dorsal posterior(50epi)_96 dorsal posterior(50epi)_97 dorsal posterior(50epi)_98 dorsal posterior(50epi)_99
TCF4_+_+_(2024r) 0.016235 0.014202 0.014010 0.014653 0.014376 0.013424 0.015316 0.013963 0.014099 0.013052 ... 0.016175 0.017039 0.016454 0.016426 0.017534 0.017502 0.017513 0.017611 0.017736 0.017642
TCF4_+_-_(340r) 0.168502 0.182706 0.180257 0.176234 0.194743 0.169320 0.194459 0.179392 0.175435 0.182556 ... 0.201456 0.206847 0.199728 0.212204 0.209357 0.199092 0.211413 0.218812 0.212306 0.204617
TCF4_-_+_(34r) 0.071814 0.076902 0.074792 0.078115 0.072811 0.075210 0.088982 0.070608 0.075925 0.071700 ... 0.075373 0.080140 0.068849 0.081392 0.088094 0.080956 0.085465 0.084446 0.093512 0.080089
TCF4_extended_+_+_(2024r) 0.016235 0.014202 0.014010 0.014653 0.014376 0.013424 0.015316 0.013963 0.014099 0.013052 ... 0.016175 0.017039 0.016454 0.016426 0.017534 0.017502 0.017513 0.017611 0.017736 0.017642
TCF4_extended_+_-_(340r) 0.168502 0.182706 0.180257 0.176234 0.194743 0.169320 0.194459 0.179392 0.175435 0.182556 ... 0.201456 0.206847 0.199728 0.212204 0.209357 0.199092 0.211413 0.218812 0.212306 0.204617
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
zic2b_extended_-_-_(58r) 0.212589 0.251023 0.225907 0.222842 0.252322 0.221213 0.222670 0.251683 0.237691 0.250392 ... 0.231756 0.249851 0.228332 0.244389 0.238694 0.230526 0.235299 0.239331 0.231912 0.212523
zic3_+_+_(17r) 0.069496 0.080841 0.074363 0.067693 0.081880 0.067678 0.068008 0.091310 0.072010 0.099404 ... 0.077506 0.085768 0.092735 0.077251 0.082145 0.094189 0.089013 0.067954 0.086340 0.069998
zic3_extended_+_+_(17r) 0.069496 0.080841 0.074363 0.067693 0.081880 0.067678 0.068008 0.091310 0.072010 0.099404 ... 0.077506 0.085768 0.092735 0.077251 0.082145 0.094189 0.089013 0.067954 0.086340 0.069998
znf598_+_-_(506r) 0.227506 0.253403 0.237788 0.233916 0.257681 0.227425 0.246402 0.245454 0.237513 0.245152 ... 0.269148 0.287723 0.281602 0.285786 0.294475 0.284463 0.289237 0.276735 0.290449 0.280056
znf598_extended_+_-_(506r) 0.227506 0.253403 0.237788 0.233916 0.257681 0.227425 0.246402 0.245454 0.237513 0.245152 ... 0.269148 0.287723 0.281602 0.285786 0.294475 0.284463 0.289237 0.276735 0.290449 0.280056

486 rows × 9500 columns

In [133]:
cistromes_auc.index.values
Out[133]:
array(['TCF4_+_+_(2024r)', 'TCF4_+_-_(340r)', 'TCF4_-_+_(34r)',
       'TCF4_extended_+_+_(2024r)', 'TCF4_extended_+_-_(340r)',
       'TCF4_extended_-_+_(34r)', 'atf3_+_+_(33r)',
       'atf3_extended_+_+_(33r)', 'atf6_+_-_(1116r)',
       'atf6_extended_+_-_(1116r)', 'atf7a_+_-_(36r)',
       'atf7a_extended_+_-_(36r)', 'bhlha15_+_+_(79r)',
       'bhlha15_+_-_(125r)', 'bhlha15_extended_+_+_(79r)',
       'bhlha15_extended_+_-_(125r)', 'bhlhe41_+_+_(65r)',
       'bhlhe41_+_-_(57r)', 'bhlhe41_extended_+_+_(65r)',
       'bhlhe41_extended_+_-_(57r)', 'cebpb_+_+_(942r)',
       'cebpb_-_-_(43r)', 'cebpb_extended_+_+_(942r)',
       'cebpb_extended_-_-_(43r)', 'creb3l2_+_+_(10r)',
       'creb3l2_extended_+_+_(10r)', 'ctcf_+_+_(502r)', 'ctcf_+_-_(143r)',
       'ctcf_-_+_(29r)', 'ctcf_-_-_(32r)', 'ctcf_extended_+_+_(502r)',
       'ctcf_extended_+_-_(143r)', 'ctcf_extended_-_+_(29r)',
       'ctcf_extended_-_-_(32r)', 'dmbx1a_+_+_(31r)',
       'dmbx1a_extended_+_+_(31r)', 'e2f3_+_+_(27r)', 'e2f3_+_-_(9r)',
       'e2f3_extended_+_+_(27r)', 'e2f3_extended_+_-_(9r)',
       'e2f5_+_+_(21r)', 'e2f5_extended_+_+_(21r)', 'e2f7_+_+_(679r)',
       'e2f7_extended_+_+_(679r)', 'ebf3b_+_-_(10r)',
       'ebf3b_extended_+_-_(10r)', 'egr2a_+_+_(88r)',
       'egr2a_extended_+_+_(88r)', 'egr2b_+_+_(124r)',
       'egr2b_extended_+_+_(124r)', 'elf1_+_+_(67r)', 'elf1_+_-_(207r)',
       'elf1_-_+_(39r)', 'elf1_-_-_(20r)', 'elf1_extended_+_+_(67r)',
       'elf1_extended_+_-_(207r)', 'elf1_extended_-_+_(39r)',
       'elf1_extended_-_-_(20r)', 'elk3_+_-_(41r)',
       'elk3_extended_+_-_(41r)', 'en2a_+_+_(22r)',
       'en2a_extended_+_+_(22r)', 'erf_+_+_(78r)', 'erf_+_-_(81r)',
       'erf_-_+_(14r)', 'erf_extended_+_+_(78r)',
       'erf_extended_+_-_(81r)', 'erf_extended_-_+_(14r)',
       'erfl3_+_+_(93r)', 'erfl3_+_-_(72r)', 'erfl3_extended_+_+_(93r)',
       'erfl3_extended_+_-_(72r)', 'ets2_+_+_(17r)', 'ets2_+_-_(19r)',
       'ets2_extended_+_+_(17r)', 'ets2_extended_+_-_(19r)',
       'etv5a_+_+_(123r)', 'etv5a_+_-_(77r)', 'etv5a_extended_+_+_(123r)',
       'etv5a_extended_+_-_(77r)', 'etv5b_+_+_(166r)', 'etv5b_+_-_(105r)',
       'etv5b_extended_+_+_(166r)', 'etv5b_extended_+_-_(105r)',
       'fli1a_+_+_(144r)', 'fli1a_+_-_(18r)', 'fli1a_extended_+_+_(144r)',
       'fli1a_extended_+_-_(18r)', 'foxa2_+_+_(52r)',
       'foxa2_extended_+_+_(52r)', 'foxa_+_+_(109r)', 'foxa_+_-_(10r)',
       'foxa_extended_+_+_(109r)', 'foxa_extended_+_-_(10r)',
       'foxd3_+_+_(25r)', 'foxd3_extended_+_+_(25r)', 'foxg1a_+_+_(75r)',
       'foxg1a_+_-_(19r)', 'foxg1a_extended_+_+_(75r)',
       'foxg1a_extended_+_-_(14r)', 'foxo1a_+_+_(30r)',
       'foxo1a_extended_+_+_(30r)', 'foxp2_+_+_(25r)',
       'foxp2_extended_+_+_(25r)', 'gata2a_+_+_(778r)',
       'gata2a_-_+_(64r)', 'gata2a_-_-_(44r)',
       'gata2a_extended_+_+_(778r)', 'gata2a_extended_-_+_(64r)',
       'gata2a_extended_-_-_(44r)', 'gata4_+_+_(524r)', 'gata4_+_-_(48r)',
       'gata4_extended_+_+_(524r)', 'gata4_extended_+_-_(48r)',
       'gata5_+_+_(54r)', 'gata5_extended_+_+_(54r)', 'gata6_+_+_(381r)',
       'gata6_+_-_(25r)', 'gata6_extended_+_+_(381r)',
       'gata6_extended_+_-_(25r)', 'gli1_+_+_(80r)',
       'gli1_extended_+_+_(80r)', 'gli2b_+_+_(55r)',
       'gli2b_extended_+_+_(55r)', 'gli3_+_+_(22r)',
       'gli3_extended_+_+_(22r)', 'glis1b_+_+_(25r)', 'glis1b_+_-_(14r)',
       'glis1b_extended_+_+_(20r)', 'glis1b_extended_+_-_(14r)',
       'grhl1_+_+_(7182r)', 'grhl1_+_-_(270r)', 'grhl1_-_+_(121r)',
       'grhl1_-_-_(613r)', 'grhl1_extended_+_+_(7182r)',
       'grhl1_extended_+_-_(270r)', 'grhl1_extended_-_+_(121r)',
       'grhl1_extended_-_-_(613r)', 'gsc_+_+_(83r)', 'gsc_+_-_(24r)',
       'gsc_extended_+_+_(83r)', 'gsc_extended_+_-_(28r)',
       'her6_+_+_(521r)', 'her6_+_-_(623r)', 'her6_-_+_(20r)',
       'her6_extended_+_+_(521r)', 'her6_extended_+_-_(623r)',
       'her6_extended_-_+_(20r)', 'her9_+_+_(502r)', 'her9_+_-_(77r)',
       'her9_extended_+_+_(502r)', 'her9_extended_+_-_(77r)',
       'hey1_+_+_(1325r)', 'hey1_+_-_(157r)', 'hey1_extended_+_+_(1325r)',
       'hey1_extended_+_-_(157r)', 'hic1_+_+_(612r)',
       'hic1_extended_+_+_(612r)', 'hnf1ba_+_+_(74r)',
       'hnf1ba_extended_+_+_(74r)', 'hnf4a_+_+_(756r)', 'hnf4a_+_-_(35r)',
       'hnf4a_-_-_(18r)', 'hnf4a_extended_+_+_(756r)',
       'hnf4a_extended_+_-_(35r)', 'hnf4a_extended_-_-_(18r)',
       'hnf4b_+_+_(834r)', 'hnf4b_+_-_(45r)', 'hnf4b_extended_+_+_(834r)',
       'hnf4b_extended_+_-_(45r)', 'hoxa9a_+_+_(86r)',
       'hoxa9a_extended_+_+_(86r)', 'hoxc1a_+_+_(13r)',
       'hoxc1a_extended_+_+_(13r)', 'hoxc9a_+_+_(55r)',
       'hoxc9a_extended_+_+_(55r)', 'klf12a_+_+_(21r)',
       'klf12a_extended_+_+_(21r)', 'klf13_+_+_(49r)', 'klf13_+_-_(26r)',
       'klf13_extended_+_+_(49r)', 'klf13_extended_+_-_(26r)',
       'klf3_+_-_(18r)', 'klf3_extended_+_-_(18r)', 'klf4_+_+_(1451r)',
       'klf4_+_-_(306r)', 'klf4_extended_+_+_(1451r)',
       'klf4_extended_+_-_(306r)', 'klf6a_+_+_(66r)', 'klf6a_+_-_(355r)',
       'klf6a_-_+_(72r)', 'klf6a_-_-_(18r)', 'klf6a_extended_+_+_(66r)',
       'klf6a_extended_+_-_(355r)', 'klf6a_extended_-_+_(72r)',
       'klf6a_extended_-_-_(18r)', 'klf8_+_+_(54r)', 'klf8_+_-_(15r)',
       'klf8_extended_+_+_(54r)', 'klf8_extended_+_-_(15r)',
       'mafa_+_-_(19r)', 'mafa_extended_+_-_(19r)', 'meis3_+_+_(37r)',
       'meis3_extended_+_+_(37r)', 'mespaa_+_+_(33r)',
       'mespaa_extended_+_+_(33r)', 'mespab_+_+_(30r)',
       'mespab_extended_+_+_(30r)', 'mlxipl_+_+_(19r)',
       'mlxipl_extended_+_+_(19r)', 'mxi1_+_+_(21r)', 'mxi1_+_-_(56r)',
       'mxi1_extended_+_+_(21r)', 'mxi1_extended_+_-_(56r)',
       'mych_+_+_(11r)', 'mych_+_-_(12r)', 'mych_extended_+_+_(11r)',
       'mych_extended_+_-_(12r)', 'mycn_+_+_(33r)',
       'mycn_extended_+_+_(33r)', 'myod1_+_+_(114r)',
       'myod1_extended_+_+_(114r)', 'neurod1_+_+_(17r)',
       'neurod1_extended_+_+_(17r)', 'nfya_+_+_(97r)', 'nfya_+_-_(31r)',
       'nfya_extended_+_+_(97r)', 'nfya_extended_+_-_(47r)',
       'nr2f2_+_+_(70r)', 'nr2f2_extended_+_+_(70r)', 'nr2f6b_+_+_(258r)',
       'nr2f6b_+_-_(76r)', 'nr2f6b_extended_+_+_(258r)',
       'nr2f6b_extended_+_-_(76r)', 'nr6a1b_+_+_(75r)',
       'nr6a1b_+_-_(31r)', 'nr6a1b_extended_+_+_(75r)',
       'nr6a1b_extended_+_-_(31r)', 'onecut1_+_+_(3304r)',
       'onecut1_+_-_(45r)', 'onecut1_-_-_(36r)',
       'onecut1_extended_+_+_(3304r)', 'onecut1_extended_+_-_(45r)',
       'onecut1_extended_-_-_(36r)', 'otx1_+_+_(361r)', 'otx1_+_-_(67r)',
       'otx1_-_+_(24r)', 'otx1_extended_+_+_(361r)',
       'otx1_extended_+_-_(67r)', 'otx1_extended_-_+_(24r)',
       'otx2a_+_+_(301r)', 'otx2a_+_-_(77r)', 'otx2a_-_+_(29r)',
       'otx2a_extended_+_+_(301r)', 'otx2a_extended_+_-_(77r)',
       'otx2a_extended_-_+_(29r)', 'otx2b_+_+_(83r)',
       'otx2b_extended_+_+_(83r)', 'pitx2_+_+_(31r)',
       'pitx2_extended_+_+_(31r)', 'pknox2_+_+_(23r)',
       'pknox2_extended_+_+_(23r)', 'pou2f2a_+_-_(70r)',
       'pou2f2a_extended_+_-_(70r)', 'prox1a_+_+_(88r)',
       'prox1a_extended_+_+_(88r)', 'rest_+_+_(11r)', 'rest_+_-_(67r)',
       'rest_extended_+_+_(11r)', 'rest_extended_+_-_(67r)',
       'rfx1a_+_+_(20r)', 'rfx1a_extended_+_+_(33r)', 'rfx2_+_+_(107r)',
       'rfx2_+_-_(47r)', 'rfx2_extended_+_+_(107r)',
       'rfx2_extended_+_-_(47r)', 'rfx3_+_+_(387r)',
       'rfx3_extended_+_+_(387r)', 'rfx4_+_+_(173r)',
       'rfx4_extended_+_+_(173r)', 'rreb1a_+_+_(648r)',
       'rreb1a_+_-_(813r)', 'rreb1a_-_-_(14r)',
       'rreb1a_extended_+_+_(648r)', 'rreb1a_extended_+_-_(813r)',
       'rreb1a_extended_-_-_(14r)', 'rreb1b_+_+_(1219r)',
       'rreb1b_-_+_(28r)', 'rreb1b_-_-_(20r)',
       'rreb1b_extended_+_+_(1219r)', 'rreb1b_extended_-_+_(28r)',
       'rreb1b_extended_-_-_(20r)', 'rxraa_+_+_(151r)', 'rxraa_-_+_(13r)',
       'rxraa_extended_+_+_(151r)', 'rxraa_extended_-_+_(13r)',
       'rxrgb_+_+_(375r)', 'rxrgb_+_-_(41r)', 'rxrgb_extended_+_+_(375r)',
       'rxrgb_extended_+_-_(41r)', 'six3b_+_-_(17r)',
       'six3b_extended_+_-_(17r)', 'six7_+_-_(13r)',
       'six7_extended_+_-_(13r)', 'snai1a_+_+_(79r)', 'snai1a_+_-_(40r)',
       'snai1a_extended_+_+_(79r)', 'snai1a_extended_+_-_(40r)',
       'sox19a_+_+_(59r)', 'sox19a_extended_+_+_(59r)',
       'sox19b_+_+_(58r)', 'sox19b_extended_+_+_(58r)', 'sox2_+_+_(56r)',
       'sox2_extended_+_+_(56r)', 'sox3_+_+_(70r)',
       'sox3_extended_+_+_(70r)', 'sox3_extended_+_-_(17r)',
       'sox9a_+_+_(17r)', 'sox9a_extended_+_+_(17r)', 'sox9b_+_-_(14r)',
       'sox9b_extended_+_-_(14r)', 'stat5a_+_-_(62r)',
       'stat5a_extended_+_-_(62r)', 'tbx16_+_+_(363r)', 'tbx16_+_-_(39r)',
       'tbx16_-_+_(27r)', 'tbx16_-_-_(72r)', 'tbx16_extended_+_+_(363r)',
       'tbx16_extended_+_-_(39r)', 'tbx16_extended_-_+_(27r)',
       'tbx16_extended_-_-_(72r)', 'tbx16l_+_+_(198r)',
       'tbx16l_+_-_(20r)', 'tbx16l_-_+_(14r)', 'tbx16l_-_-_(15r)',
       'tbx16l_extended_+_+_(198r)', 'tbx16l_extended_+_-_(21r)',
       'tbx16l_extended_-_+_(14r)', 'tbx16l_extended_-_-_(15r)',
       'tbx19_+_+_(55r)', 'tbx19_+_-_(12r)', 'tbx19_extended_+_+_(55r)',
       'tbx19_extended_+_-_(12r)', 'tbx1_+_+_(503r)', 'tbx1_+_-_(115r)',
       'tbx1_extended_+_+_(503r)', 'tbx1_extended_+_-_(115r)',
       'tbx20_+_+_(11r)', 'tbx20_extended_+_+_(11r)', 'tbx2a_+_-_(21r)',
       'tbx2a_extended_+_-_(21r)', 'tbx2b_+_+_(33r)', 'tbx2b_+_-_(63r)',
       'tbx2b_extended_+_+_(33r)', 'tbx2b_extended_+_-_(63r)',
       'tbx5a_+_-_(19r)', 'tbx5a_extended_+_-_(19r)', 'tbxta_+_+_(1129r)',
       'tbxta_+_-_(115r)', 'tbxta_-_+_(113r)', 'tbxta_-_-_(75r)',
       'tbxta_extended_+_+_(1129r)', 'tbxta_extended_+_-_(115r)',
       'tbxta_extended_-_+_(113r)', 'tbxta_extended_-_-_(75r)',
       'tbxtb_+_+_(870r)', 'tbxtb_-_-_(19r)', 'tbxtb_extended_+_+_(870r)',
       'tbxtb_extended_-_-_(19r)', 'tcf12_+_+_(1197r)', 'tcf12_-_+_(37r)',
       'tcf12_-_-_(17r)', 'tcf12_extended_+_+_(1197r)',
       'tcf12_extended_-_+_(37r)', 'tcf12_extended_-_-_(17r)',
       'tcf3a_+_+_(335r)', 'tcf3a_+_-_(432r)', 'tcf3a_-_-_(12r)',
       'tcf3a_extended_+_+_(335r)', 'tcf3a_extended_+_-_(432r)',
       'tcf3b_+_+_(556r)', 'tcf3b_+_-_(127r)',
       'tcf3b_extended_+_+_(556r)', 'tcf3b_extended_+_-_(127r)',
       'tcf7_+_+_(31r)', 'tcf7_extended_+_+_(31r)', 'tcf7l1a_+_+_(92r)',
       'tcf7l1a_+_-_(28r)', 'tcf7l1a_-_+_(64r)',
       'tcf7l1a_extended_+_+_(92r)', 'tcf7l1a_extended_+_-_(28r)',
       'tcf7l1a_extended_-_+_(64r)', 'tcf7l1b_+_+_(60r)',
       'tcf7l1b_+_-_(30r)', 'tcf7l1b_-_+_(83r)',
       'tcf7l1b_extended_+_+_(60r)', 'tcf7l1b_extended_+_-_(30r)',
       'tcf7l1b_extended_-_+_(83r)', 'tcf7l2_+_+_(122r)',
       'tcf7l2_+_-_(34r)', 'tcf7l2_extended_+_+_(122r)',
       'tcf7l2_extended_+_-_(35r)', 'tead1a_+_+_(199r)',
       'tead1a_+_-_(229r)', 'tead1a_extended_+_+_(199r)',
       'tead1a_extended_+_-_(229r)', 'tead1b_+_+_(435r)',
       'tead1b_+_-_(59r)', 'tead1b_-_-_(13r)',
       'tead1b_extended_+_+_(435r)', 'tead1b_extended_+_-_(59r)',
       'tead1b_extended_-_-_(13r)', 'tead3a_+_+_(1530r)',
       'tead3a_+_-_(100r)', 'tead3a_extended_+_+_(1530r)',
       'tead3a_extended_+_-_(100r)', 'tead3b_+_+_(1830r)',
       'tead3b_+_-_(99r)', 'tead3b_extended_+_+_(1830r)',
       'tead3b_extended_+_-_(99r)', 'tefa_+_+_(128r)', 'tefa_+_-_(134r)',
       'tefa_extended_+_+_(128r)', 'tefa_extended_+_-_(134r)',
       'tfap2a_+_+_(2300r)', 'tfap2a_+_-_(658r)', 'tfap2a_-_+_(126r)',
       'tfap2a_-_-_(74r)', 'tfap2a_extended_+_+_(2300r)',
       'tfap2a_extended_+_-_(641r)', 'tfap2a_extended_-_+_(126r)',
       'tfap2a_extended_-_-_(74r)', 'tfap2c_+_+_(2564r)',
       'tfap2c_+_-_(136r)', 'tfap2c_-_+_(173r)', 'tfap2c_-_-_(67r)',
       'tfap2c_extended_+_+_(2564r)', 'tfap2c_extended_+_-_(136r)',
       'tfap2c_extended_-_+_(173r)', 'tfap2c_extended_-_-_(58r)',
       'tfap2e_+_-_(49r)', 'tfap2e_extended_+_-_(49r)',
       'tfap4_+_-_(400r)', 'tfap4_-_+_(76r)', 'tfap4_extended_+_-_(400r)',
       'tfap4_extended_-_+_(76r)', 'tfeb_+_+_(44r)', 'tfeb_+_-_(32r)',
       'tfeb_extended_+_+_(44r)', 'tfeb_extended_+_-_(56r)',
       'tp53_+_+_(78r)', 'tp53_extended_+_+_(78r)', 'vdrb_+_-_(19r)',
       'vdrb_extended_+_-_(19r)', 'wt1a_+_+_(70r)', 'wt1a_+_-_(39r)',
       'wt1a_extended_+_+_(70r)', 'wt1a_extended_+_-_(39r)',
       'ybx1_+_+_(47r)', 'ybx1_+_-_(21r)', 'ybx1_extended_+_+_(47r)',
       'ybx1_extended_+_-_(21r)', 'zbtb18_+_+_(876r)', 'zbtb18_+_-_(28r)',
       'zbtb18_extended_+_+_(876r)', 'zbtb18_extended_+_-_(27r)',
       'zeb1a_+_+_(10r)', 'zeb1a_extended_+_+_(10r)', 'zic1_+_+_(91r)',
       'zic1_extended_+_+_(91r)', 'zic2a_+_+_(413r)', 'zic2a_+_-_(64r)',
       'zic2a_extended_+_+_(413r)', 'zic2a_extended_+_-_(64r)',
       'zic2b_+_+_(229r)', 'zic2b_+_-_(88r)', 'zic2b_-_+_(31r)',
       'zic2b_-_-_(58r)', 'zic2b_extended_+_+_(229r)',
       'zic2b_extended_+_-_(88r)', 'zic2b_extended_-_+_(31r)',
       'zic2b_extended_-_-_(58r)', 'zic3_+_+_(17r)',
       'zic3_extended_+_+_(17r)', 'znf598_+_-_(506r)',
       'znf598_extended_+_-_(506r)'], dtype=object)
In [61]:
cell_data = pd.DataFrame([x.rsplit('_', 1)[0] for x in cistromes_auc.columns], index=cistromes_auc.columns).iloc[:, 0]
In [62]:
cell_data
Out[62]:
non-dorsal margin(dome)_0     non-dorsal margin(dome)
non-dorsal margin(dome)_1     non-dorsal margin(dome)
non-dorsal margin(dome)_2     non-dorsal margin(dome)
non-dorsal margin(dome)_3     non-dorsal margin(dome)
non-dorsal margin(dome)_4     non-dorsal margin(dome)
                                       ...           
dorsal posterior(50epi)_95    dorsal posterior(50epi)
dorsal posterior(50epi)_96    dorsal posterior(50epi)
dorsal posterior(50epi)_97    dorsal posterior(50epi)
dorsal posterior(50epi)_98    dorsal posterior(50epi)
dorsal posterior(50epi)_99    dorsal posterior(50epi)
Name: 0, Length: 9500, dtype: object
In [97]:
name='TCF4_+_+'
In [98]:
name.split('_')[0]
Out[98]:
'TCF4'
In [99]:
tf_expr = dgem.loc[name.split('_')[0], :]
        
In [100]:
tf_acc = cistromes_auc.index[cistromes_auc.index.str.contains(name + '_(', regex=False)][0]
In [95]:
cistromes_auc.index
Out[95]:
Index(['TCF4_+_+_(2024r)', 'TCF4_+_-_(340r)', 'TCF4_-_+_(34r)',
       'TCF4_extended_+_+_(2024r)', 'TCF4_extended_+_-_(340r)',
       'TCF4_extended_-_+_(34r)', 'atf3_+_+_(33r)', 'atf3_extended_+_+_(33r)',
       'atf6_+_-_(1116r)', 'atf6_extended_+_-_(1116r)',
       ...
       'zic2b_-_+_(31r)', 'zic2b_-_-_(58r)', 'zic2b_extended_+_+_(229r)',
       'zic2b_extended_+_-_(88r)', 'zic2b_extended_-_+_(31r)',
       'zic2b_extended_-_-_(58r)', 'zic3_+_+_(17r)', 'zic3_extended_+_+_(17r)',
       'znf598_+_-_(506r)', 'znf598_extended_+_-_(506r)'],
      dtype='object', length=486)
In [101]:
cistromes_auc_tf = cistromes_auc.loc[tf_acc, :]
In [112]:
prune_plot(scplus_obj,
           'TCF4_+_+',
           pseudobulk_variable = 'GEX_cell.type',
           show_dot_plot = True,
           show_line_plot = False,
           color_dict = color_dict,
           use_pseudobulk = True,
           auc_key = 'eRegulon_AUC',
           signature_key = 'Region_based',
           seed=555)
In [113]:
# Gene based
prune_plot(scplus_obj,
           'TCF4_+_+',
           pseudobulk_variable = 'GEX_cell.type',
           show_dot_plot = True,
           show_line_plot = False,
           color_dict = color_dict,
           use_pseudobulk = True,
           auc_key = 'eRegulon_AUC',
           signature_key = 'Gene_based',
           seed=555)
In [58]:
# output 
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [59]:
scplus_obj.uns.keys()
Out[59]:
dict_keys(['Cistromes', 'search_space', 'region_to_gene', 'TF2G_adj', 'eRegulons_importance', 'eRegulon_metadata', 'eRegulon_signatures', 'eRegulon_AUC', 'Pseudobulk', 'TF_cistrome_correlation', 'selected_eRegulons', 'eRegulon_AUC_thresholds'])
In [75]:
# TF_cistrome_correlation (eGRN_gene_based)
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN_gene_based'].to_csv("output/scenicplus/TF_cistrome_gene_based_correlation.txt",sep ='\t',index=False,header=True)
In [77]:
# TF_cistrome_correlation (eGRN__region_based)
scplus_obj.uns['TF_cistrome_correlation']['GEX_cell.type_eGRN__region_based'].to_csv("output/scenicplus/TF_cistrome_region_based_correlation.txt",sep ='\t',index=False,header=True)
D. Identification of high quality regulons¶

We will select a subset of regulons based on the correlation between the region based and the gene based regulons. We will only use the extended eRegulon if there is not a direct eRegulon available.

In [114]:
# correlation between region based regulons and gene based regulons
df1 = scplus_obj.uns['eRegulon_AUC']['Gene_based'].copy()
df2 = scplus_obj.uns['eRegulon_AUC']['Region_based'].copy()
df1.columns = [x.split('_(')[0] for x in df1.columns]
df2.columns = [x.split('_(')[0] for x in df2.columns]
correlations = df1.corrwith(df2, axis = 0)
correlations = correlations[abs(correlations) > 0.6] 
# kepp only R2G +
keep = [x for x in correlations.index if '+_+' in x] + [x for x in correlations.index if '-_+' in x]
# keep extended if not direct
extended = [x for x in keep if 'extended' in x]
direct = [x for x in keep if not 'extended' in x]
keep_extended = [x for x in extended if not x.replace('extended_', '') in direct]
keep = direct + keep_extended
# keep regulons with more than 10 genes
keep_gene = [x for x in scplus_obj.uns['eRegulon_AUC']['Gene_based'].columns if x.split('_(')[0] in keep]
keep_gene = [x for x in keep_gene if (int(x.split('_(')[1].replace('g)', '')) > 10)]
keep_all = [x.split('_(')[0] for x in keep_gene]
keep_region = [x for x in scplus_obj.uns['eRegulon_AUC']['Region_based'].columns if x.split('_(')[0] in keep]
scplus_obj.uns['selected_eRegulons'] = {}
scplus_obj.uns['selected_eRegulons']['Gene_based'] = keep_gene
scplus_obj.uns['selected_eRegulons']['Region_based'] = keep_region
E. Overlap between eRegulons¶
In [7]:
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withRSS.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [ ]:
correlation_heatmap(scplus_obj,
                    auc_key = 'eRegulon_AUC',
                    signature_keys = ['Gene_based'],
                    selected_regulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
                    fcluster_threshold = 0.1,
                    save = 'output/network_analysis_results/regulon_cluster.pdf',
                    fontsize = 3)
In [ ]:
# we can also check the overlap between eRegulons:
jaccard_heatmap(scplus_obj,
                    gene_or_region_based = 'Gene_based',
                    signature_key = 'eRegulon_signatures',
                    selected_regulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
                    fcluster_threshold = 0.1,
                    fontsize = 3,
                    save = 'output/network_analysis_results/regulon_overlap.pdf',
                    method='intersect')
In [ ]:
# we can also binarize the eRegulons as in SCENIC. This information will be used afterwards for generating the loom file
binarize_AUC(scplus_obj,
             auc_key='eRegulon_AUC',
             out_key='eRegulon_AUC_thresholds',
             signature_keys=['Gene_based', 'Region_based'],
             n_cpu=8)
In [117]:
with open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'wb') as f:
  dill.dump(scplus_obj, f)
F. eRegulon specificity scores¶
In [79]:
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withBinarizedAUC.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [126]:
regulon_specificity_scores(scplus_obj,
                         'GEX_cell.type',
                         signature_keys=['Gene_based'],
                         selected_regulons=scplus_obj.uns['selected_eRegulons']['Gene_based'],
                         out_key_suffix='_gene_based',
                         scale=False)
In [127]:
plot_rss(scplus_obj, 'GEX_cell.type_gene_based', num_columns=4, top_n=10,
        save='output/network_analysis_results/celltype_specific_regulon.pdf')
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/IPython/core/pylabtools.py:151: UserWarning:

This figure includes Axes that are not compatible with tight_layout, so results might be incorrect.

In [18]:
heatmap_dotplot(
        scplus_obj = scplus_obj,
        size_matrix = scplus_obj.uns['RSS']['GEX_cell.type_gene_based'],
        color_matrix = scplus_obj.to_df('EXP'),
        scale_size_matrix = True,
        scale_color_matrix = True,
        group_variable = 'GEX_cell.type',
        subset_eRegulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
        figsize = (25, 20),
        orientation = 'vertical',
        save = 'output/network_analysis_results/regulon_specificity.pdf',
        split_repressor_activator=True)
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:718: PlotnineWarning:

Saving 25 x 20 in image.

/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:719: PlotnineWarning:

Filename: output/network_analysis_results/regulon_specificity.pdf

In [65]:
colnames = list()
for col in scplus_obj.uns['eRegulon_AUC']['Region_based'].columns:
    if 'extended' not in col and ('+_+' in col or '-_+' in col):
        colnames += [col]
In [67]:
len(colnames)
Out[67]:
141
In [68]:
scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend'] = scplus_obj.uns['eRegulon_AUC']['Region_based'][colnames]
In [69]:
scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend']
Out[69]:
TCF4_+_+_(2024r) TCF4_-_+_(34r) atf3_+_+_(33r) bhlha15_+_+_(79r) bhlhe41_+_+_(65r) cebpb_+_+_(942r) creb3l2_+_+_(10r) ctcf_+_+_(502r) ctcf_-_+_(29r) dmbx1a_+_+_(31r) ... tp53_+_+_(78r) wt1a_+_+_(70r) ybx1_+_+_(47r) zbtb18_+_+_(876r) zeb1a_+_+_(10r) zic1_+_+_(91r) zic2a_+_+_(413r) zic2b_+_+_(229r) zic2b_-_+_(31r) zic3_+_+_(17r)
Cell
30epi_AAACAGCCAGTTGCGT-1___cisTopic 0.236184 0.029294 0.232283 0.162929 0.122100 0.190718 0.123698 0.026957 0.019013 0.025652 ... 0.118459 0.036569 0.123082 0.017417 0.126264 0.016276 0.047055 0.099914 0.111129 0.003803
30epi_AAACCAACACCTAAGC-1___cisTopic 0.013967 0.064256 0.175003 0.072468 0.138885 0.004360 0.250777 0.053468 0.035071 0.047464 ... 0.016587 0.034934 0.162504 0.026205 0.237600 0.041107 0.103522 0.251155 0.102911 0.079095
30epi_AAACCGAAGGTTTGCG-1___cisTopic 0.014294 0.071337 0.175724 0.069385 0.142614 0.004268 0.261329 0.053448 0.024866 0.041074 ... 0.020542 0.038202 0.166890 0.028121 0.211475 0.042303 0.096785 0.214687 0.120824 0.055013
30epi_AAACGCGCAGTAAGTA-1___cisTopic 0.013777 0.056596 0.193737 0.081006 0.184027 0.004791 0.375361 0.047252 0.027643 0.049907 ... 0.030226 0.048837 0.149964 0.024393 0.216769 0.034169 0.107139 0.239600 0.100849 0.108058
30epi_AAAGCACCAGCATTAT-1___cisTopic 0.014109 0.058481 0.184313 0.071980 0.155580 0.003913 0.376766 0.046059 0.017559 0.044940 ... 0.033327 0.067920 0.175644 0.033080 0.279368 0.039814 0.104459 0.229825 0.137459 0.068201
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
shield_s4_TTTGTCTAGGTTTGAC-1___cisTopic 0.016303 0.082319 0.118115 0.077388 0.181197 0.004776 0.348147 0.036602 0.022523 0.041467 ... 0.038786 0.048248 0.136296 0.045564 0.145694 0.038615 0.097767 0.185086 0.204019 0.054160
shield_s4_TTTGTCTAGTCACCTC-1___cisTopic 0.014467 0.043104 0.131677 0.121333 0.191412 0.004147 0.507955 0.034547 0.031505 0.098749 ... 0.040940 0.116200 0.117108 0.056584 0.215721 0.120950 0.180552 0.304227 0.255366 0.217991
shield_s4_TTTGTCTAGTCAGGCC-1___cisTopic 0.016600 0.128103 0.099268 0.079432 0.146422 0.003817 0.302809 0.032908 0.006712 0.038093 ... 0.028671 0.060112 0.139824 0.122713 0.122987 0.045252 0.121687 0.185533 0.212812 0.052990
shield_s4_TTTGTGGCACATGCTA-1___cisTopic 0.013606 0.028060 0.143696 0.186219 0.194413 0.004218 0.459560 0.035174 0.016028 0.114225 ... 0.039883 0.096790 0.121368 0.042640 0.246653 0.117839 0.159517 0.267571 0.203638 0.161297
shield_s4_TTTGTTGGTATTGAGT-1___cisTopic 0.313955 0.030738 0.107686 0.213307 0.121744 0.249302 0.088372 0.022332 0.025721 0.022761 ... 0.140332 0.025315 0.081890 0.017971 0.050867 0.017883 0.030802 0.062230 0.122371 0.024670

40992 rows × 141 columns

In [71]:
heatmap_dotplot(
        scplus_obj = scplus_obj,
        size_matrix = scplus_obj.uns['eRegulon_AUC']['Region_based_withoutExtend'], #specify what to plot as dot sizes, target region enrichment in this case
        color_matrix = scplus_obj.to_df('EXP'), #specify  what to plot as colors, TF expression in this case
        scale_size_matrix = True,
        scale_color_matrix = True,
        group_variable = 'GEX_cell.type',
        subset_eRegulons = scplus_obj.uns['selected_eRegulons']['Gene_based'],
        figsize = (25, 20),
        orientation = 'vertical',
        save = 'output/network_analysis_results/regulon_enhancerActivity.pdf',

        split_repressor_activator=True)
/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:718: PlotnineWarning:

Saving 25 x 20 in image.

/scicore/home/schiera/liu0005/miniconda3/envs/scenicplus/lib/python3.8/site-packages/plotnine/ggplot.py:719: PlotnineWarning:

Filename: output/network_analysis_results/regulon_enhancerActivity.pdf

In [30]:
scplus_obj.uns['RSS']['GEX_cell.type_gene_based']
Out[30]:
TCF4_+_+_(734g) bhlha15_+_+_(64g) cebpb_+_+_(556g) ctcf_+_+_(277g) ctcf_-_+_(22g) dmbx1a_+_+_(26g) egr2a_+_+_(35g) egr2b_+_+_(42g) elf1_+_+_(67g) elf1_-_+_(30g) ... tead3b_+_+_(782g) tfap2a_+_+_(545g) tfap2c_+_+_(485g) tfap2c_-_+_(73g) ybx1_+_+_(45g) zbtb18_+_+_(205g) zic1_+_+_(41g) zic2a_+_+_(140g) zic2b_+_+_(118g) zic3_+_+_(17g)
evl(30epi) 0.204734 0.173197 0.208967 0.170580 0.170599 0.168651 0.169338 0.170173 0.186495 0.168435 ... 0.189761 0.180227 0.170886 0.172020 0.169481 0.168878 0.168152 0.169417 0.170190 0.169013
dorsal margin(30epi) 0.172401 0.170434 0.172491 0.178957 0.168433 0.172360 0.170333 0.171769 0.173713 0.175362 ... 0.172646 0.171010 0.173579 0.173539 0.177311 0.170977 0.170828 0.172688 0.174987 0.171819
non-dorsal margin(30epi) 0.175760 0.174109 0.176001 0.186304 0.168291 0.174474 0.169623 0.171931 0.177755 0.178147 ... 0.175526 0.173970 0.178146 0.175478 0.185385 0.172035 0.171323 0.176345 0.179529 0.172820
ectoderm(30epi) 0.187082 0.176057 0.188877 0.218329 0.170317 0.185453 0.169627 0.173886 0.188271 0.201121 ... 0.188973 0.184852 0.196404 0.174310 0.212606 0.178858 0.179545 0.183302 0.204064 0.180442
apoptosis like(30epi) 0.168281 0.168059 0.168378 0.169877 0.167528 0.167728 0.167645 0.167763 0.168408 0.168871 ... 0.168435 0.168303 0.168909 0.167792 0.169683 0.167944 0.168182 0.168201 0.169103 0.167953
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
dorsal ectoderm(shield) 0.224567 0.209960 0.223933 0.282054 0.188495 0.232317 0.229591 0.219829 0.230558 0.285438 ... 0.230473 0.233360 0.254569 0.218797 0.283444 0.218498 0.234391 0.241405 0.280718 0.248107
endoderm(shield) 0.174806 0.178771 0.174685 0.178753 0.173080 0.177627 0.174959 0.177863 0.178295 0.176696 ... 0.175762 0.174788 0.177652 0.178361 0.178528 0.173511 0.173169 0.179302 0.175756 0.177557
evl(shield) 0.278154 0.188867 0.285436 0.171548 0.180985 0.168858 0.172374 0.172595 0.219142 0.169095 ... 0.250275 0.215063 0.177524 0.177564 0.172052 0.173389 0.174750 0.173603 0.173627 0.180771
ysl(shield) 0.170037 0.171708 0.171291 0.168991 0.169667 0.167780 0.168067 0.167683 0.169875 0.168787 ... 0.174798 0.169435 0.169761 0.169892 0.168143 0.168559 0.167910 0.168900 0.168028 0.171475
dorsal anterior(shield) 0.172001 0.182495 0.171776 0.173761 0.173514 0.178601 0.171212 0.175501 0.173359 0.171793 ... 0.173269 0.172551 0.173015 0.173874 0.171888 0.171918 0.173317 0.176316 0.172409 0.176351

95 rows × 84 columns

G. Export to loom¶
In [7]:
infile = open(outDir + 'scenicplus/zf_highToSomite_scplusObject_withRSS.pkl', 'rb')
scplus_obj = dill.load(infile) # (can't use pickle, only dill)
infile.close()
In [156]:
export_to_loom(scplus_obj,
               signature_key = 'Gene_based',
               tree_structure = ('10x_multiome_zebrafish', 'SCENIC+'),
               title = 'Tutorial - Gene based eGRN',
               nomenclature = "danRer11",
               out_fname=outDir + 'scenicplus/zebrafish_eGRN_gene_based.loom')
2022-11-09 22:19:28,505 SCENIC+      INFO     Formatting data
2022-11-09 22:19:31,483 SCENIC+      INFO     Creating minimal loom
2022-11-09 22:20:40,918 SCENIC+      INFO     Adding annotations
2022-11-09 22:20:44,780 SCENIC+      INFO     Exporting
In [ ]:
export_to_loom(scplus_obj,
               signature_key = 'Region_based',
               tree_structure = ('10x_multiome_zebrafish', 'SCENIC+'),
               title = 'Tutorial - Gene based eGRN',
               nomenclature = "danRer11",
               out_fname=outDir + 'scenicplus/zebrafish_eGRN_region_based.loom')
2022-11-09 22:36:43,375 SCENIC+      INFO     Formatting data
H. Plotting networks¶
In [82]:
(scplus_obj.uns['selected_eRegulons']['Gene_based'])
Out[82]:
['TCF4_+_+_(734g)',
 'bhlha15_+_+_(64g)',
 'cebpb_+_+_(556g)',
 'ctcf_+_+_(277g)',
 'ctcf_-_+_(22g)',
 'dmbx1a_+_+_(26g)',
 'egr2a_+_+_(35g)',
 'egr2b_+_+_(42g)',
 'elf1_+_+_(67g)',
 'elf1_-_+_(30g)',
 'en2a_+_+_(17g)',
 'erf_-_+_(14g)',
 'etv5b_+_+_(136g)',
 'fli1a_+_+_(61g)',
 'foxa_+_+_(85g)',
 'foxg1a_+_+_(29g)',
 'gata2a_+_+_(269g)',
 'gata4_+_+_(361g)',
 'gata5_+_+_(47g)',
 'gata6_+_+_(329g)',
 'gli1_+_+_(60g)',
 'gli2b_+_+_(53g)',
 'gli3_+_+_(20g)',
 'glis1b_+_+_(23g)',
 'grhl1_+_+_(976g)',
 'gsc_+_+_(53g)',
 'her6_+_+_(215g)',
 'her6_-_+_(12g)',
 'her9_+_+_(211g)',
 'hey1_+_+_(388g)',
 'hic1_+_+_(204g)',
 'hnf1ba_+_+_(71g)',
 'hnf4a_+_+_(565g)',
 'hnf4b_+_+_(603g)',
 'hoxa9a_+_+_(52g)',
 'hoxc9a_+_+_(33g)',
 'klf4_+_+_(647g)',
 'klf6a_+_+_(56g)',
 'klf6a_-_+_(40g)',
 'meis3_+_+_(32g)',
 'mespaa_+_+_(29g)',
 'myod1_+_+_(57g)',
 'nfya_+_+_(68g)',
 'nr2f2_+_+_(55g)',
 'nr2f6b_+_+_(184g)',
 'onecut1_+_+_(947g)',
 'otx1_+_+_(140g)',
 'otx2a_+_+_(154g)',
 'otx2b_+_+_(58g)',
 'pitx2_+_+_(32g)',
 'prox1a_+_+_(85g)',
 'rfx3_+_+_(204g)',
 'rfx4_+_+_(141g)',
 'rreb1b_+_+_(489g)',
 'rxraa_+_+_(86g)',
 'rxrgb_+_+_(259g)',
 'snai1a_+_+_(42g)',
 'sox19a_+_+_(44g)',
 'sox19b_+_+_(44g)',
 'sox2_+_+_(46g)',
 'sox3_+_+_(58g)',
 'tbx16_+_+_(239g)',
 'tbx16l_+_+_(119g)',
 'tbx1_+_+_(147g)',
 'tbxta_+_+_(275g)',
 'tbxtb_+_+_(221g)',
 'tcf12_+_+_(365g)',
 'tcf3b_+_+_(199g)',
 'tcf7_+_+_(22g)',
 'tcf7l1a_-_+_(35g)',
 'tcf7l1b_-_+_(45g)',
 'tcf7l2_+_+_(107g)',
 'tead1b_+_+_(301g)',
 'tead3a_+_+_(683g)',
 'tead3b_+_+_(782g)',
 'tfap2a_+_+_(545g)',
 'tfap2c_+_+_(485g)',
 'tfap2c_-_+_(73g)',
 'ybx1_+_+_(45g)',
 'zbtb18_+_+_(205g)',
 'zic1_+_+_(41g)',
 'zic2a_+_+_(140g)',
 'zic2b_+_+_(118g)',
 'zic3_+_+_(17g)']
In [ ]:
(scplus_obj.uns['selected_eRegulons']['Gene_based']).to_csv(outDir + 'scenicplus/eRegulon_name.csv',sep='\t',index=False)  
In [85]:
with open(outDir + 'scenicplus/eRegulon_name.csv', 'w') as fp:
    for item in (scplus_obj.uns['selected_eRegulons']['Gene_based']):
        # write each item on a new line
        fp.write("%s\n" % item)
    print('Done')
Done
In [27]:
scplus_obj.uns['RSS']['GEX_cell.type_gene_based'].to_csv(outDir + 'scenicplus/eRegulon_RSS.csv',sep='\t')